jenkins-bot has submitted this change and it was merged. Change subject: Store multi byte characters as one element ......................................................................
Store multi byte characters as one element Getting & setting the cursor is done with byte offsets instead of data model offset (characters) so we need to be able to convert between the two as well as splitting characters. TODO: The regex only works on surrogate pairs, not yet combining accents. fixupInsertion will combine a combining mark with the character to its left it it can. Bug: 48630 Change-Id: I8d936fb15d82f73cd45fac142c540a7950850d55 --- M demos/ve/index.php A demos/ve/pages/multibyte.html M modules/ve/ce/ve.ce.Document.js M modules/ve/ce/ve.ce.Surface.js M modules/ve/ce/ve.ce.SurfaceObserver.js M modules/ve/ce/ve.ce.js M modules/ve/dm/lineardata/ve.dm.ElementLinearData.js M modules/ve/dm/ve.dm.Converter.js M modules/ve/dm/ve.dm.Document.js M modules/ve/dm/ve.dm.SurfaceFragment.js M modules/ve/dm/ve.dm.Transaction.js M modules/ve/test/dm/ve.dm.Converter.test.js M modules/ve/ve.js 13 files changed, 112 insertions(+), 24 deletions(-) Approvals: Catrope: Looks good to me, approved jenkins-bot: Verified diff --git a/demos/ve/index.php b/demos/ve/index.php index 15b15c3..cfec056 100644 --- a/demos/ve/index.php +++ b/demos/ve/index.php @@ -403,7 +403,7 @@ $label.addClass( 've-demo-dump-element' ); text = element.type; annotations = element.annotations; - } else if ( element.length > 1 ){ + } else if ( ve.isArray( element ) ){ $label.addClass( 've-demo-dump-achar' ); text = element[0]; annotations = element[1]; diff --git a/demos/ve/pages/multibyte.html b/demos/ve/pages/multibyte.html new file mode 100644 index 0000000..e488ec1 --- /dev/null +++ b/demos/ve/pages/multibyte.html @@ -0,0 +1,6 @@ +<p>12𨋢456789𨋢bc</p> +<p>「𨋢」字響<tt>香港</tt>衍生出好多新詞,好似:𨋢</p> +<p>abc</p> +<p>one c̀ombining accent</p> +<p>two ç̀ombining accents</p> +<p>def</p> \ No newline at end of file diff --git a/modules/ve/ce/ve.ce.Document.js b/modules/ve/ce/ve.ce.Document.js index 6b3f67e..1625cf9 100644 --- a/modules/ve/ce/ve.ce.Document.js +++ b/modules/ve/ce/ve.ce.Document.js @@ -125,7 +125,7 @@ * @method * @param {number} offset Linear model offset * @returns {Object} Object containing a node and offset property where node is an HTML element and - * offset is the position within the element + * offset is the byte position within the element * @throws {Error} Offset could not be translated to a DOM element and offset */ ve.ce.Document.prototype.getNodeAndOffset = function ( offset ) { @@ -150,7 +150,7 @@ if ( offset >= startOffset && offset <= startOffset + length ) { return { node: item, - offset: offset - startOffset + offset: ve.getByteOffset( item.textContent, offset - startOffset ) }; } else { startOffset += length; diff --git a/modules/ve/ce/ve.ce.Surface.js b/modules/ve/ce/ve.ce.Surface.js index 486a18c..c55e699 100644 --- a/modules/ve/ce/ve.ce.Surface.js +++ b/modules/ve/ce/ve.ce.Surface.js @@ -593,7 +593,7 @@ pasteData = view.clipboard[key]; } else { pasteText = view.$pasteTarget.text().replace( /\n/gm, ''); - pasteData = new ve.dm.DocumentSlice( pasteText.split( '' ) ); + pasteData = new ve.dm.DocumentSlice( ve.splitCharacters( pasteText ) ); } // Transact @@ -762,10 +762,10 @@ // Simple insertion if ( lengthDiff > 0 && offsetDiff === lengthDiff /* && sameLeadingAndTrailing */) { - data = next.text.substring( + data = ve.splitCharacters( next.text.substring( previous.range.start - nodeOffset - 1, next.range.start - nodeOffset - 1 - ).split( '' ); + ) ); // Apply insertion annotations annotations = this.model.getInsertionAnnotations(); if ( annotations instanceof ve.dm.AnnotationSet ) { @@ -814,7 +814,7 @@ ) { ++fromRight; } - data = next.text.substring( fromLeft, next.text.length - fromRight ).split( '' ); + data = ve.splitCharacters( next.text.substring( fromLeft, next.text.length - fromRight ) ); // Get annotations to the left of new content and apply annotations = this.model.getDocument().data.getAnnotationsFromOffset( nodeOffset + 1 + fromLeft ); @@ -1199,8 +1199,8 @@ */ ve.ce.Surface.prototype.handleDelete = function ( e, backspace ) { var sourceOffset, targetOffset, sourceSplitableNode, targetSplitableNode, tx, cursorAt, - sourceNode, targetNode, sourceData, nodeToDelete, adjacentData, adjacentText, - adjacentTextAfterMatch, endOffset, i, containsInlineElements = false, + sourceNode, targetNode, sourceData, nodeToDelete, adjacentData, adjacentText, adjacentChar, + adjacentTextAfterMatch, endOffset, i, containsComplexElements = false, selection = this.model.getSelection(); if ( selection.isCollapsed() ) { @@ -1257,13 +1257,18 @@ for ( i = 0; i < adjacentData.length; i++ ) { if ( adjacentData[i].type !== undefined ) { - containsInlineElements = true; + containsComplexElements = true; break; } - adjacentText += adjacentData[i][0]; + adjacentChar = ve.isArray( adjacentData[i] ) ? adjacentData[i][0] : adjacentData[i]; + if ( adjacentChar.length > 1 ) { + containsComplexElements = true; + break; + } + adjacentText += adjacentChar; } - if ( !containsInlineElements ) { + if ( !containsComplexElements ) { adjacentTextAfterMatch = adjacentText.match( this.constructor.static.textPattern ); // If there are "normal" characters in the adjacent text let the browser handle natively if ( adjacentTextAfterMatch !== null && adjacentTextAfterMatch.length ) { diff --git a/modules/ve/ce/ve.ce.SurfaceObserver.js b/modules/ve/ce/ve.ce.SurfaceObserver.js index a92b4a8..69b0cf3 100644 --- a/modules/ve/ce/ve.ce.SurfaceObserver.js +++ b/modules/ve/ce/ve.ce.SurfaceObserver.js @@ -154,7 +154,7 @@ node = this.node; rangyRange = ve.ce.DomRange.newFromDomSelection( rangy.getSelection() ); - if ( !rangyRange.equals( this.rangyRange ) ){ + if ( !rangyRange.equals( this.rangyRange ) ) { this.rangyRange = rangyRange; node = null; $nodeOrSlug = $( rangyRange.anchorNode ).closest( '.ve-ce-branchNode, .ve-ce-branchNode-slug' ); diff --git a/modules/ve/ce/ve.ce.js b/modules/ve/ce/ve.ce.js index fcb0dbd..f557f40 100644 --- a/modules/ve/ce/ve.ce.js +++ b/modules/ve/ce/ve.ce.js @@ -157,10 +157,11 @@ item = current[0][current[1]]; if ( item.nodeType === Node.TEXT_NODE ) { if ( item === domNode ) { - offset += domOffset; + // domOffset is a byte offset, convert it to a character offset + offset += ve.getCharacterOffset( item.textContent, domOffset ); break; } else { - offset += item.textContent.length; + offset += ve.getCharacterOffset( item.textContent, item.textContent.length ); } } else if ( item.nodeType === Node.ELEMENT_NODE ) { $item = current[0].eq( current[1] ); @@ -173,9 +174,9 @@ } else if ( $item.hasClass( 've-ce-branchNode' ) ) { offset += $item.data( 'view' ).getOuterLength(); } else { - stack.push( [$item.contents(), 0 ] ); + stack.push( [ $item.contents(), 0 ] ); current[1]++; - current = stack[stack.length-1]; + current = stack[ stack.length - 1 ]; continue; } } diff --git a/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js b/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js index c6c83de..0f68431 100644 --- a/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js +++ b/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js @@ -320,7 +320,11 @@ element = this.getData( offset ); } - return element.annotations || element[1] || []; + if ( typeof element === 'string' ) { + return []; + } else { + return element.annotations || element[1]; + } }; /** diff --git a/modules/ve/dm/ve.dm.Converter.js b/modules/ve/dm/ve.dm.Converter.js index acdf97e..6f51ae6 100644 --- a/modules/ve/dm/ve.dm.Converter.js +++ b/modules/ve/dm/ve.dm.Converter.js @@ -41,7 +41,7 @@ */ ve.dm.Converter.getDataContentFromText = function ( text, annotations ) { var i, len, - characters = text.split( '' ); + characters = ve.splitCharacters( text ); if ( !annotations || annotations.isEmpty() ) { return characters; diff --git a/modules/ve/dm/ve.dm.Document.js b/modules/ve/dm/ve.dm.Document.js index 859e9f3..8c0da12 100644 --- a/modules/ve/dm/ve.dm.Document.js +++ b/modules/ve/dm/ve.dm.Document.js @@ -480,12 +480,19 @@ * @method * @param {Array} data Snippet of linear model data to insert * @param {number} offset Offset in the linear model where the caller wants to insert data - * @returns {Object} A (possibly modified) copy of data and a (possibly modified) offset + * @returns {Object} A (possibly modified) copy of data, a (possibly modified) offset + * and a number of elements to remove */ ve.dm.Document.prototype.fixupInsertion = function ( data, offset ) { var // Array where we build the return value newData = [], + + // Temporary variables for handling combining marks + insert, annotations, + // An unattached combining mark may require the insertion to remove a character, + // so we send this counter back in the result + remove = 0, // *** Stacks *** // Array of element openings (object). Openings in data are pushed onto this stack @@ -705,6 +712,29 @@ } } while ( !childrenOK ); + if ( + i === 0 && + childType === 'text' && + ve.isUnattachedCombiningMark( data[i] ) + ) { + // Note we only need to check data[0] as combining marks further + // along should already have been merged + if ( doc.data.isElementData( offset - 1 ) ) { + // Inserting a unattached combining mark is generally pretty badly + // supported (browser rendering bugs), so we'll just prevent it. + continue; + } else { + offset--; + remove++; + insert = doc.data.getCharacterData( offset ) + data[i]; + annotations = doc.data.getAnnotationIndexesFromOffset( offset ); + if ( annotations.length ) { + insert = [ insert, annotations ]; + } + data[i] = insert; + } + } + for ( j = 0; j < closings.length; j++ ) { // writeElement() would update openingStack/closingStack, but we've already done // that for closings @@ -762,7 +792,8 @@ return { offset: offset, - data: newData + data: newData, + remove: remove }; }; diff --git a/modules/ve/dm/ve.dm.SurfaceFragment.js b/modules/ve/dm/ve.dm.SurfaceFragment.js index 96f93da..1654d1a 100644 --- a/modules/ve/dm/ve.dm.SurfaceFragment.js +++ b/modules/ve/dm/ve.dm.SurfaceFragment.js @@ -525,7 +525,7 @@ } // Auto-convert content to array of plain text characters if ( typeof content === 'string' ) { - content = content.split( '' ); + content = ve.splitCharacters( content ); } if ( content.length ) { if ( annotate ) { diff --git a/modules/ve/dm/ve.dm.Transaction.js b/modules/ve/dm/ve.dm.Transaction.js index df3c542..460e899 100644 --- a/modules/ve/dm/ve.dm.Transaction.js +++ b/modules/ve/dm/ve.dm.Transaction.js @@ -37,7 +37,7 @@ // Retain up to insertion point, if needed tx.pushRetain( insertion.offset ); // Insert data - tx.pushReplace( doc, offset, 0, insertion.data ); + tx.pushReplace( doc, insertion.offset, insertion.remove, insertion.data ); // Retain to end of document, if needed (for completeness) tx.pushRetain( data.length - insertion.offset ); return tx; diff --git a/modules/ve/test/dm/ve.dm.Converter.test.js b/modules/ve/test/dm/ve.dm.Converter.test.js index 675ebcb..7c8da34 100644 --- a/modules/ve/test/dm/ve.dm.Converter.test.js +++ b/modules/ve/test/dm/ve.dm.Converter.test.js @@ -100,7 +100,7 @@ store.index( cases[msg].storeItems[i].value, cases[msg].storeItems[i].hash ); } } - if( cases[msg].modify ) { + if ( cases[msg].modify ) { cases[msg].modify( cases[msg].data ); } doc = new ve.dm.Document( ve.dm.example.preprocessAnnotations( cases[msg].data, store ) ); diff --git a/modules/ve/ve.js b/modules/ve/ve.js index c95da7b..f491b96 100644 --- a/modules/ve/ve.js +++ b/modules/ve/ve.js @@ -815,6 +815,47 @@ }; /** + * Split a string into individual characters, leaving multibyte characters as one item + * + * @param {string} text Text to split + * @returns {string[]} Array of characters + */ + ve.splitCharacters = function ( text ) { + return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g ); // don't split UTF surrogate pairs + }; + + /** + * Determine if the text consists of only unattached combining marks + * @param {string} text Text to test + * @returns {boolean} The text is unattached combining marks + */ + ve.isUnattachedCombiningMark = function ( text ) { + return ( /^[\u0300-\u036F]+$/ ).test( text ); + }; + + /** + * Convert a character offset to a byte offset + * + * @param {string} text Text in which to calculate offset + * @param {number} characterOffset Character offset + * @returns {number} Byte offset + */ + ve.getByteOffset = function ( text, characterOffset ) { + return ve.splitCharacters( text ).slice( 0, characterOffset ).join( '' ).length; + }; + + /** + * Convert a byte offset to a character offset + * + * @param {string} text Text in which to calculate offset + * @param {number} byteOffset Byte offset + * @returns {number} Character offset + */ + ve.getCharacterOffset = function ( text, byteOffset ) { + return ve.splitCharacters( text.substring( 0, byteOffset ) ).length; + }; + + /** * Escapes non-word characters so they can be safely used as HTML attribute values. * * This method is basically a copy of mw.html.escape. -- To view, visit https://gerrit.wikimedia.org/r/64965 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I8d936fb15d82f73cd45fac142c540a7950850d55 Gerrit-PatchSet: 9 Gerrit-Project: mediawiki/extensions/VisualEditor Gerrit-Branch: master Gerrit-Owner: Esanders <esand...@wikimedia.org> Gerrit-Reviewer: Catrope <roan.katt...@gmail.com> Gerrit-Reviewer: Divec <da...@sheetmusic.org.uk> Gerrit-Reviewer: Esanders <esand...@wikimedia.org> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits