Skip to content

Commit

Permalink
Merge pull request #46 from KentoMoriwaki/emoji2
Browse files Browse the repository at this point in the history
Simpler handling surrogate pairs
  • Loading branch information
dmonad committed Mar 17, 2023
2 parents 71fc525 + a9f3a66 commit a339075
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 3 deletions.
12 changes: 12 additions & 0 deletions diff.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ import { equalityStrict } from './function.js'
* @template T
*/

// Matches a single UTF-16 code unit in the high (leading) surrogate range U+D800–U+DBFF,
// i.e. the first half of a surrogate pair.
const highSurrogateRegex = /[\uD800-\uDBFF]/
// Matches a single UTF-16 code unit in the low (trailing) surrogate range U+DC00–U+DFFF,
// i.e. the second half of a surrogate pair.
const lowSurrogateRegex = /[\uDC00-\uDFFF]/

/**
* Create a diff between two strings. This diff implementation is highly
* efficient, but not very sophisticated.
Expand All @@ -44,9 +47,13 @@ export const simpleDiffString = (a, b) => {
while (left < a.length && left < b.length && a[left] === b[left]) {
left++
}
// If the last matching character is a high surrogate, we need to roll back to the previous character
if (highSurrogateRegex.test(a[left - 1])) left--
while (right + left < a.length && right + left < b.length && a[a.length - right - 1] === b[b.length - right - 1]) {
right++
}
// If the last matching character is a low surrogate, we need to roll back to the previous character
if (lowSurrogateRegex.test(a[a.length - right])) right--
return {
index: left,
remove: a.length - left - right,
Expand Down Expand Up @@ -111,6 +118,8 @@ export const simpleDiffStringWithCursor = (a, b, cursor) => {
) {
left++
}
// If the last matching character is a high surrogate, we need to roll back to the previous character
if (highSurrogateRegex.test(a[left - 1])) left--
// Iterate right to the left until we find a changed character
while (
right + left < a.length &&
Expand All @@ -119,6 +128,8 @@ export const simpleDiffStringWithCursor = (a, b, cursor) => {
) {
right++
}
// If the last matching character is a low surrogate, we need to roll back to the previous character
if (lowSurrogateRegex.test(a[a.length - right])) right--
// Try to iterate left further to the right without caring about the current cursor position
while (
right + left < a.length &&
Expand All @@ -127,6 +138,7 @@ export const simpleDiffStringWithCursor = (a, b, cursor) => {
) {
left++
}
if (highSurrogateRegex.test(a[left - 1])) left--
return {
index: left,
remove: a.length - left - right,
Expand Down
28 changes: 25 additions & 3 deletions diff.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { simpleDiffString, simpleDiffArray, simpleDiffStringWithCursor } from '.
import * as prng from './prng.js'
import * as f from './function.js'
import * as t from './testing.js'
import * as object from './object.js'
import * as str from './string.js'

/**
Expand All @@ -14,8 +13,12 @@ function runDiffTest (a, b, expected) {
const result = simpleDiffString(a, b)
t.compare(result, expected)
t.compare(result, simpleDiffStringWithCursor(a, b, a.length)) // check that the withCursor approach returns the same result
const arrResult = simpleDiffArray(a.split(''), b.split(''))
t.compare(arrResult, object.assign({}, result, { insert: result.insert.split('') }))
const recomposed = str.splice(a, result.index, result.remove, result.insert)
t.compareStrings(recomposed, b)
const arrResult = simpleDiffArray(Array.from(a), Array.from(b))
const arrRecomposed = Array.from(a)
arrRecomposed.splice(arrResult.index, arrResult.remove, ...arrResult.insert)
t.compareStrings(arrRecomposed.join(''), b)
}

/**
Expand All @@ -30,6 +33,11 @@ export const testDiffing = tc => {
runDiffTest('abc', 'xyz', { index: 0, remove: 3, insert: 'xyz' })
runDiffTest('axz', 'au', { index: 1, remove: 2, insert: 'u' })
runDiffTest('ax', 'axy', { index: 2, remove: 0, insert: 'y' })
// These strings share high-surrogate characters
runDiffTest('\u{d83d}\u{dc77}'/* '👷' */, '\u{d83d}\u{dea7}\u{d83d}\u{dc77}'/* '🚧👷' */, { index: 0, remove: 0, insert: '🚧' })
runDiffTest('\u{d83d}\u{dea7}\u{d83d}\u{dc77}'/* '🚧👷' */, '\u{d83d}\u{dc77}'/* '👷' */, { index: 0, remove: 2, insert: '' })
// These strings share low-surrogate characters
runDiffTest('\u{d83d}\u{dfe6}\u{d83d}\u{dfe6}'/* '🟦🟦' */, '\u{d83c}\u{dfe6}\u{d83d}\u{dfe6}'/* '🏦🟦' */, { index: 0, remove: 2, insert: '🏦' })
}

/**
Expand Down Expand Up @@ -74,6 +82,20 @@ export const testSimpleDiffWithCursor = tc => {
const recomposed = str.splice(initial, change.index, change.remove, change.insert)
t.compareStrings(expected, recomposed)
}
{
const initial = '🚧🚧🚧'
const change = simpleDiffStringWithCursor(initial, '🚧🚧', 2) // Should delete after the midst of 🚧
t.compare(change, { insert: '', remove: 2, index: 2 })
const recomposed = str.splice(initial, change.index, change.remove, change.insert)
t.compareStrings('🚧🚧', recomposed)
}
{
const initial = '🚧👷🚧👷'
const change = simpleDiffStringWithCursor(initial, '🚧🚧', 2) // Should delete after the first 🚧 and insert 🚧
t.compare(change, { insert: '🚧', remove: 6, index: 2 })
const recomposed = str.splice(initial, change.index, change.remove, change.insert)
t.compareStrings('🚧🚧', recomposed)
}
}

/**
Expand Down

0 comments on commit a339075

Please sign in to comment.