From 0a437f5800081d3fd0f1d047d7d801509fdbc885 Mon Sep 17 00:00:00 2001 From: Sidharth Vinod Date: Fri, 9 Jun 2023 16:48:30 +0530 Subject: [PATCH] feat: split unicode properly --- .../src/rendering-util/splitText.spec.ts | 5 +- .../mermaid/src/rendering-util/splitText.ts | 163 +++++++----------- 2 files changed, 63 insertions(+), 105 deletions(-) diff --git a/packages/mermaid/src/rendering-util/splitText.spec.ts b/packages/mermaid/src/rendering-util/splitText.spec.ts index 6444627d2..77bd6102c 100644 --- a/packages/mermaid/src/rendering-util/splitText.spec.ts +++ b/packages/mermaid/src/rendering-util/splitText.spec.ts @@ -1,4 +1,4 @@ -import { splitTextToChars, splitLineToFitWidthLoop, type CheckFitFunction } from './splitText.js'; +import { splitTextToChars, splitLineToFitWidth, type CheckFitFunction } from './splitText.js'; import { describe, it, expect } from 'vitest'; describe('splitText', () => { @@ -6,6 +6,7 @@ describe('splitText', () => { { str: '', split: [] }, { str: '🏳️‍⚧️🏳️‍🌈👩🏾‍❤️‍👨🏻', split: ['🏳️‍⚧️', '🏳️‍🌈', '👩🏾‍❤️‍👨🏻'] }, { str: 'ok', split: ['o', 'k'] }, + { str: 'abc', split: ['a', 'b', 'c'] }, ])('should split $str into graphemes', ({ str, split }: { str: string; split: string[] }) => { expect(splitTextToChars(str)).toEqual(split); }); @@ -31,7 +32,7 @@ describe('split lines', () => { const checkFn: CheckFitFunction = (text: string) => { return splitTextToChars(text).length <= width; }; - expect(splitLineToFitWidthLoop(str.split(' '), checkFn)).toEqual(split); + expect(splitLineToFitWidth(str, checkFn)).toEqual(split); } ); }); diff --git a/packages/mermaid/src/rendering-util/splitText.ts b/packages/mermaid/src/rendering-util/splitText.ts index de71fdafd..b8ee7a1b0 100644 --- a/packages/mermaid/src/rendering-util/splitText.ts +++ b/packages/mermaid/src/rendering-util/splitText.ts @@ -10,126 +10,83 @@ export function splitTextToChars(text: string): string[] { return [...text]; } -export function splitWordToFitWidth(checkFit: CheckFitFunction, word: string): string[] { - console.error('splitWordToFitWidth', word); - const characters = splitTextToChars(word); - if (characters.length === 0) { - return []; +/** + * Splits a string into words. + */ +function splitLineToWords(text: string): string[] { + if (Intl.Segmenter) { + return [...new Intl.Segmenter(undefined, { granularity: 'word' }).segment(text)] + .map((s) => s.segment) + .filter((word) => word !== ' '); } - const newWord = []; - let lastCheckedCharacter = ''; - while (characters.length > 0) { - lastCheckedCharacter = characters.shift() ?? ' '; - if (checkFit([...newWord, lastCheckedCharacter].join(''))) { - newWord.push(lastCheckedCharacter); - } else if (newWord.length === 0) { - // Even the first character was too long, we cannot split it, so return it as is. - // This is an edge case that can happen when the first character is a long grapheme. - return [lastCheckedCharacter, characters.join('')]; - } else { - // The last character was too long, so we need to put it back and return the rest. - characters.unshift(lastCheckedCharacter); - break; - } - } - if (characters.length === 0) { - return [newWord.join('')]; - } - console.error({ newWord, characters }); - return [newWord.join(''), ...splitWordToFitWidth(checkFit, characters.join(''))]; + return text.split(' '); } -export function splitWordToFitWidth2(checkFit: CheckFitFunction, word: string): [string, string] { - console.error('splitWordToFitWidth2', word); +/** + * Splits a word into two parts, the first part fits the width and the remaining part. + * @param checkFit - Function to check if word fits + * @param word - Word to split + * @returns [first part of word that fits, rest of word] + */ +export function splitWordToFitWidth(checkFit: CheckFitFunction, word: string): [string, string] { const characters = splitTextToChars(word); if (characters.length === 0) { return ['', '']; } - const newWord = []; - let lastCheckedCharacter = ''; - while (characters.length > 0) { - lastCheckedCharacter = characters.shift() ?? ' '; - if (checkFit([...newWord, lastCheckedCharacter].join(''))) { - newWord.push(lastCheckedCharacter); - } else if (newWord.length === 0) { - // Even the first character was too long, we cannot split it, so return it as is. - // This is an edge case that can happen when the first character is a long grapheme. - return [lastCheckedCharacter, characters.join('')]; - } else { - // The last character was too long, so we need to put it back and return the rest. - characters.unshift(lastCheckedCharacter); - break; - } - } - console.error({ newWord, characters }); - return [newWord.join(''), characters.join('')]; + return splitWordToFitWidthRecursion(checkFit, [], characters); } -export function splitLineToFitWidth( +function splitWordToFitWidthRecursion( + checkFit: CheckFitFunction, + usedChars: string[], + remainingChars: string[] +): [string, string] { + if (remainingChars.length === 0) { + return [usedChars.join(''), '']; + } + const [nextChar, ...rest] = remainingChars; + const newWord = [...usedChars, nextChar]; + if (checkFit(newWord.join(''))) { + return splitWordToFitWidthRecursion(checkFit, newWord, rest); + } + return [usedChars.join(''), remainingChars.join('')]; +} + +export function splitLineToFitWidth(line: string, checkFit: CheckFitFunction): string[] { + return splitLineToFitWidthRecursion(splitLineToWords(line), checkFit); +} + +function splitLineToFitWidthRecursion( words: string[], checkFit: CheckFitFunction, lines: string[] = [], - popped: string[] = [] + newLine = '' ): string[] { - console.error('splitLineToFitWidth', { words, lines, popped }); // Return if there is nothing left to split - if (words.length === 0 && popped.length === 0) { - return lines; - } - const remainingText = words.join(' '); - if (checkFit(remainingText)) { - lines.push(remainingText); - words = [...popped]; - } - if (words.length > 1) { - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - popped.unshift(words.pop()!); - return splitLineToFitWidth(words, checkFit, lines, popped); - } else if (words.length === 1) { - const [word, rest] = splitWordToFitWidth(checkFit, words[0]); - lines.push(word); - console.error({ word, rest }); - if (rest) { - return splitLineToFitWidth([rest], checkFit, lines, []); - } - } - return lines; -} - -export function splitLineToFitWidthLoop(words: string[], checkFit: CheckFitFunction): string[] { - console.error('splitLineToFitWidthLoop', { words }); if (words.length === 0) { - return []; + // If there is a new line, add it to the lines + if (newLine.length > 0) { + lines.push(newLine); + } + return lines.length > 0 ? lines : ['']; + } + const nextWord = words.shift() ?? ' '; + const lineWithNextWord = newLine ? `${newLine} ${nextWord}` : nextWord; + if (checkFit(lineWithNextWord)) { + // nextWord fits, so we can add it to the new line and continue + return splitLineToFitWidthRecursion(words, checkFit, lines, lineWithNextWord); } - const lines: string[] = []; - let newLine: string[] = []; - let lastCheckedWord = ''; - while (words.length > 0) { - lastCheckedWord = words.shift() ?? ' '; - console.error({ lastCheckedWord, words }); - if (checkFit([...newLine, lastCheckedWord].join(' '))) { - newLine.push(lastCheckedWord); - } else { - console.error({ newLine }); - if (newLine.length === 0) { - const [word, rest] = splitWordToFitWidth2(checkFit, lastCheckedWord); - console.error({ word, rest }); - lines.push(word); - if (rest) { - words.unshift(rest); - } - } else { - words.unshift(lastCheckedWord); - lines.push(newLine.join(' ')); - newLine = []; - } - } - console.error({ newLine, lastCheckedWord, words, lines }); - } + // nextWord doesn't fit, so we need to split it if (newLine.length > 0) { - lines.push(newLine.join(' ')); + // There was text in newLine, so add it to lines and push nextWord back into words. + lines.push(newLine); + words.unshift(nextWord); + } else { + // There was no text in newLine, so we need to split nextWord + const [line, rest] = splitWordToFitWidth(checkFit, nextWord); + lines.push(line); + words.unshift(rest); } - console.error({ newLine, lastCheckedWord, words, lines }); - return lines; + return splitLineToFitWidthRecursion(words, checkFit, lines); }