feat: split unicode properly

This commit is contained in:
Sidharth Vinod 2023-06-09 16:48:30 +05:30
parent c41df420d7
commit 0a437f5800
No known key found for this signature in database
GPG Key ID: FB5CCD378D3907CD
2 changed files with 63 additions and 105 deletions

View File

@@ -1,4 +1,4 @@
import { splitTextToChars, splitLineToFitWidthLoop, type CheckFitFunction } from './splitText.js'; import { splitTextToChars, splitLineToFitWidth, type CheckFitFunction } from './splitText.js';
import { describe, it, expect } from 'vitest'; import { describe, it, expect } from 'vitest';
describe('splitText', () => { describe('splitText', () => {
@@ -6,6 +6,7 @@ describe('splitText', () => {
{ str: '', split: [] }, { str: '', split: [] },
{ str: '🏳️‍⚧️🏳️‍🌈👩🏾‍❤️‍👨🏻', split: ['🏳️‍⚧️', '🏳️‍🌈', '👩🏾‍❤️‍👨🏻'] }, { str: '🏳️‍⚧️🏳️‍🌈👩🏾‍❤️‍👨🏻', split: ['🏳️‍⚧️', '🏳️‍🌈', '👩🏾‍❤️‍👨🏻'] },
{ str: 'ok', split: ['o', 'k'] }, { str: 'ok', split: ['o', 'k'] },
{ str: 'abc', split: ['a', 'b', 'c'] },
])('should split $str into graphemes', ({ str, split }: { str: string; split: string[] }) => { ])('should split $str into graphemes', ({ str, split }: { str: string; split: string[] }) => {
expect(splitTextToChars(str)).toEqual(split); expect(splitTextToChars(str)).toEqual(split);
}); });
@@ -31,7 +32,7 @@ describe('split lines', () => {
const checkFn: CheckFitFunction = (text: string) => { const checkFn: CheckFitFunction = (text: string) => {
return splitTextToChars(text).length <= width; return splitTextToChars(text).length <= width;
}; };
expect(splitLineToFitWidthLoop(str.split(' '), checkFn)).toEqual(split); expect(splitLineToFitWidth(str, checkFn)).toEqual(split);
} }
); );
}); });

View File

@@ -10,126 +10,83 @@ export function splitTextToChars(text: string): string[] {
return [...text]; return [...text];
} }
export function splitWordToFitWidth(checkFit: CheckFitFunction, word: string): string[] { /**
console.error('splitWordToFitWidth', word); * Splits a string into words.
const characters = splitTextToChars(word); */
if (characters.length === 0) { function splitLineToWords(text: string): string[] {
return []; if (Intl.Segmenter) {
return [...new Intl.Segmenter(undefined, { granularity: 'word' }).segment(text)]
.map((s) => s.segment)
.filter((word) => word !== ' ');
} }
const newWord = []; return text.split(' ');
let lastCheckedCharacter = '';
while (characters.length > 0) {
lastCheckedCharacter = characters.shift() ?? ' ';
if (checkFit([...newWord, lastCheckedCharacter].join(''))) {
newWord.push(lastCheckedCharacter);
} else if (newWord.length === 0) {
// Even the first character was too long, we cannot split it, so return it as is.
// This is an edge case that can happen when the first character is a long grapheme.
return [lastCheckedCharacter, characters.join('')];
} else {
// The last character was too long, so we need to put it back and return the rest.
characters.unshift(lastCheckedCharacter);
break;
}
}
if (characters.length === 0) {
return [newWord.join('')];
}
console.error({ newWord, characters });
return [newWord.join(''), ...splitWordToFitWidth(checkFit, characters.join(''))];
} }
export function splitWordToFitWidth2(checkFit: CheckFitFunction, word: string): [string, string] { /**
console.error('splitWordToFitWidth2', word); * Splits a word into two parts, the first part fits the width and the remaining part.
* @param checkFit - Function to check if word fits
* @param word - Word to split
* @returns [first part of word that fits, rest of word]
*/
export function splitWordToFitWidth(checkFit: CheckFitFunction, word: string): [string, string] {
const characters = splitTextToChars(word); const characters = splitTextToChars(word);
if (characters.length === 0) { if (characters.length === 0) {
return ['', '']; return ['', ''];
} }
const newWord = []; return splitWordToFitWidthRecursion(checkFit, [], characters);
let lastCheckedCharacter = '';
while (characters.length > 0) {
lastCheckedCharacter = characters.shift() ?? ' ';
if (checkFit([...newWord, lastCheckedCharacter].join(''))) {
newWord.push(lastCheckedCharacter);
} else if (newWord.length === 0) {
// Even the first character was too long, we cannot split it, so return it as is.
// This is an edge case that can happen when the first character is a long grapheme.
return [lastCheckedCharacter, characters.join('')];
} else {
// The last character was too long, so we need to put it back and return the rest.
characters.unshift(lastCheckedCharacter);
break;
}
}
console.error({ newWord, characters });
return [newWord.join(''), characters.join('')];
} }
/**
 * Recursively moves characters from the front of `remainingChars` onto `usedChars`
 * for as long as the accumulated text still passes `checkFit`.
 *
 * At least one character is always consumed when `usedChars` is empty, even if that
 * single character does not fit. Otherwise the first part would be empty and the rest
 * would be the unchanged word, so a caller that re-queues the rest would never make
 * progress.
 *
 * @param checkFit - Function to check if the accumulated text fits
 * @param usedChars - Characters already accepted into the first part
 * @param remainingChars - Characters that have not been tried yet
 * @returns [first part of word, rest of word]
 */
function splitWordToFitWidthRecursion(
  checkFit: CheckFitFunction,
  usedChars: string[],
  remainingChars: string[]
): [string, string] {
  // Nothing left to try: everything accepted so far is the first part.
  if (remainingChars.length === 0) {
    return [usedChars.join(''), ''];
  }
  const [nextChar, ...rest] = remainingChars;
  const newWord = [...usedChars, nextChar];
  if (checkFit(newWord.join(''))) {
    return splitWordToFitWidthRecursion(checkFit, newWord, rest);
  }
  if (usedChars.length === 0) {
    // Even the first character is too long. It cannot be split any further,
    // so emit it on its own and hand back whatever follows it.
    return [newWord.join(''), rest.join('')];
  }
  // nextChar does not fit: stop here and return it together with the rest.
  return [usedChars.join(''), remainingChars.join('')];
}
/**
 * Splits a line of text into multiple lines.
 *
 * The line is first broken into words with `splitLineToWords`; the resulting words are
 * then handed to `splitLineToFitWidthRecursion` together with `checkFit`.
 *
 * @param line - Text to split
 * @param checkFit - Function to check if a candidate line fits
 * @returns The lines produced by `splitLineToFitWidthRecursion`
 */
export function splitLineToFitWidth(line: string, checkFit: CheckFitFunction): string[] {
  const words = splitLineToWords(line);
  return splitLineToFitWidthRecursion(words, checkFit);
}
function splitLineToFitWidthRecursion(
words: string[], words: string[],
checkFit: CheckFitFunction, checkFit: CheckFitFunction,
lines: string[] = [], lines: string[] = [],
popped: string[] = [] newLine = ''
): string[] { ): string[] {
console.error('splitLineToFitWidth', { words, lines, popped });
// Return if there is nothing left to split // Return if there is nothing left to split
if (words.length === 0 && popped.length === 0) {
return lines;
}
const remainingText = words.join(' ');
if (checkFit(remainingText)) {
lines.push(remainingText);
words = [...popped];
}
if (words.length > 1) {
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
popped.unshift(words.pop()!);
return splitLineToFitWidth(words, checkFit, lines, popped);
} else if (words.length === 1) {
const [word, rest] = splitWordToFitWidth(checkFit, words[0]);
lines.push(word);
console.error({ word, rest });
if (rest) {
return splitLineToFitWidth([rest], checkFit, lines, []);
}
}
return lines;
}
export function splitLineToFitWidthLoop(words: string[], checkFit: CheckFitFunction): string[] {
console.error('splitLineToFitWidthLoop', { words });
if (words.length === 0) { if (words.length === 0) {
return []; // If there is a new line, add it to the lines
if (newLine.length > 0) {
lines.push(newLine);
}
return lines.length > 0 ? lines : [''];
}
const nextWord = words.shift() ?? ' ';
const lineWithNextWord = newLine ? `${newLine} ${nextWord}` : nextWord;
if (checkFit(lineWithNextWord)) {
// nextWord fits, so we can add it to the new line and continue
return splitLineToFitWidthRecursion(words, checkFit, lines, lineWithNextWord);
} }
const lines: string[] = []; // nextWord doesn't fit, so we need to split it
let newLine: string[] = [];
let lastCheckedWord = '';
while (words.length > 0) {
lastCheckedWord = words.shift() ?? ' ';
console.error({ lastCheckedWord, words });
if (checkFit([...newLine, lastCheckedWord].join(' '))) {
newLine.push(lastCheckedWord);
} else {
console.error({ newLine });
if (newLine.length === 0) {
const [word, rest] = splitWordToFitWidth2(checkFit, lastCheckedWord);
console.error({ word, rest });
lines.push(word);
if (rest) {
words.unshift(rest);
}
} else {
words.unshift(lastCheckedWord);
lines.push(newLine.join(' '));
newLine = [];
}
}
console.error({ newLine, lastCheckedWord, words, lines });
}
if (newLine.length > 0) { if (newLine.length > 0) {
lines.push(newLine.join(' ')); // There was text in newLine, so add it to lines and push nextWord back into words.
lines.push(newLine);
words.unshift(nextWord);
} else {
// There was no text in newLine, so we need to split nextWord
const [line, rest] = splitWordToFitWidth(checkFit, nextWord);
lines.push(line);
words.unshift(rest);
} }
console.error({ newLine, lastCheckedWord, words, lines }); return splitLineToFitWidthRecursion(words, checkFit, lines);
return lines;
} }