jenkins-bot has submitted this change and it was merged.

Change subject: Store multi byte characters as one element
......................................................................


Store multi byte characters as one element

Getting & setting the cursor is done with byte offsets
instead of data model offset (characters) so we need to
be able to convert between the two as well as splitting
characters.

TODO: The regex only works on surrogate pairs, not
yet combining accents.

fixupInsertion will combine a combining mark with the
character to its left it it can.

Bug: 48630

Change-Id: I8d936fb15d82f73cd45fac142c540a7950850d55
---
M demos/ve/index.php
A demos/ve/pages/multibyte.html
M modules/ve/ce/ve.ce.Document.js
M modules/ve/ce/ve.ce.Surface.js
M modules/ve/ce/ve.ce.SurfaceObserver.js
M modules/ve/ce/ve.ce.js
M modules/ve/dm/lineardata/ve.dm.ElementLinearData.js
M modules/ve/dm/ve.dm.Converter.js
M modules/ve/dm/ve.dm.Document.js
M modules/ve/dm/ve.dm.SurfaceFragment.js
M modules/ve/dm/ve.dm.Transaction.js
M modules/ve/test/dm/ve.dm.Converter.test.js
M modules/ve/ve.js
13 files changed, 112 insertions(+), 24 deletions(-)

Approvals:
  Catrope: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/demos/ve/index.php b/demos/ve/index.php
index 15b15c3..cfec056 100644
--- a/demos/ve/index.php
+++ b/demos/ve/index.php
@@ -403,7 +403,7 @@
                                                $label.addClass( 
've-demo-dump-element' );
                                                text = element.type;
                                                annotations = 
element.annotations;
-                                       } else if ( element.length > 1 ){
+                                       } else if ( ve.isArray( element ) ){
                                                $label.addClass( 
've-demo-dump-achar' );
                                                text = element[0];
                                                annotations = element[1];
diff --git a/demos/ve/pages/multibyte.html b/demos/ve/pages/multibyte.html
new file mode 100644
index 0000000..e488ec1
--- /dev/null
+++ b/demos/ve/pages/multibyte.html
@@ -0,0 +1,6 @@
+<p>12𨋢456789𨋢bc</p>
+<p>「𨋢」字響<tt>香港</tt>衍生出好多新詞,好似:𨋢</p>
+<p>abc</p>
+<p>one c̀ombining accent</p>
+<p>two ç̀ombining accents</p>
+<p>def</p>
\ No newline at end of file
diff --git a/modules/ve/ce/ve.ce.Document.js b/modules/ve/ce/ve.ce.Document.js
index 6b3f67e..1625cf9 100644
--- a/modules/ve/ce/ve.ce.Document.js
+++ b/modules/ve/ce/ve.ce.Document.js
@@ -125,7 +125,7 @@
  * @method
  * @param {number} offset Linear model offset
  * @returns {Object} Object containing a node and offset property where node 
is an HTML element and
- * offset is the position within the element
+ * offset is the byte position within the element
  * @throws {Error} Offset could not be translated to a DOM element and offset
  */
 ve.ce.Document.prototype.getNodeAndOffset = function ( offset ) {
@@ -150,7 +150,7 @@
                        if ( offset >= startOffset && offset <= startOffset + 
length ) {
                                return {
                                        node: item,
-                                       offset: offset - startOffset
+                                       offset: ve.getByteOffset( 
item.textContent, offset - startOffset )
                                };
                        } else {
                                startOffset += length;
diff --git a/modules/ve/ce/ve.ce.Surface.js b/modules/ve/ce/ve.ce.Surface.js
index 486a18c..c55e699 100644
--- a/modules/ve/ce/ve.ce.Surface.js
+++ b/modules/ve/ce/ve.ce.Surface.js
@@ -593,7 +593,7 @@
                        pasteData = view.clipboard[key];
                } else {
                        pasteText = view.$pasteTarget.text().replace( /\n/gm, 
'');
-                       pasteData = new ve.dm.DocumentSlice( pasteText.split( 
'' ) );
+                       pasteData = new ve.dm.DocumentSlice( 
ve.splitCharacters( pasteText ) );
                }
 
                // Transact
@@ -762,10 +762,10 @@
 
                // Simple insertion
                if ( lengthDiff > 0 && offsetDiff === lengthDiff /* && 
sameLeadingAndTrailing */) {
-                       data = next.text.substring(
+                       data = ve.splitCharacters( next.text.substring(
                                previous.range.start - nodeOffset - 1,
                                next.range.start - nodeOffset - 1
-                       ).split( '' );
+                       ) );
                        // Apply insertion annotations
                        annotations = this.model.getInsertionAnnotations();
                        if ( annotations instanceof ve.dm.AnnotationSet ) {
@@ -814,7 +814,7 @@
        ) {
                ++fromRight;
        }
-       data = next.text.substring( fromLeft, next.text.length - fromRight 
).split( '' );
+       data = ve.splitCharacters( next.text.substring( fromLeft, 
next.text.length - fromRight ) );
        // Get annotations to the left of new content and apply
        annotations =
                this.model.getDocument().data.getAnnotationsFromOffset( 
nodeOffset + 1 + fromLeft );
@@ -1199,8 +1199,8 @@
  */
 ve.ce.Surface.prototype.handleDelete = function ( e, backspace ) {
        var sourceOffset, targetOffset, sourceSplitableNode, 
targetSplitableNode, tx, cursorAt,
-               sourceNode, targetNode, sourceData, nodeToDelete, adjacentData, 
adjacentText,
-               adjacentTextAfterMatch, endOffset, i, containsInlineElements = 
false,
+               sourceNode, targetNode, sourceData, nodeToDelete, adjacentData, 
adjacentText, adjacentChar,
+               adjacentTextAfterMatch, endOffset, i, containsComplexElements = 
false,
                selection = this.model.getSelection();
 
        if ( selection.isCollapsed() ) {
@@ -1257,13 +1257,18 @@
 
                for ( i = 0; i < adjacentData.length; i++ ) {
                        if ( adjacentData[i].type !== undefined ) {
-                               containsInlineElements = true;
+                               containsComplexElements = true;
                                break;
                        }
-                       adjacentText += adjacentData[i][0];
+                       adjacentChar = ve.isArray( adjacentData[i] ) ? 
adjacentData[i][0] : adjacentData[i];
+                       if ( adjacentChar.length > 1 ) {
+                               containsComplexElements = true;
+                               break;
+                       }
+                       adjacentText += adjacentChar;
                }
 
-               if ( !containsInlineElements ) {
+               if ( !containsComplexElements ) {
                        adjacentTextAfterMatch = adjacentText.match( 
this.constructor.static.textPattern );
                        // If there are "normal" characters in the adjacent 
text let the browser handle natively
                        if ( adjacentTextAfterMatch !== null && 
adjacentTextAfterMatch.length ) {
diff --git a/modules/ve/ce/ve.ce.SurfaceObserver.js 
b/modules/ve/ce/ve.ce.SurfaceObserver.js
index a92b4a8..69b0cf3 100644
--- a/modules/ve/ce/ve.ce.SurfaceObserver.js
+++ b/modules/ve/ce/ve.ce.SurfaceObserver.js
@@ -154,7 +154,7 @@
        node = this.node;
        rangyRange = ve.ce.DomRange.newFromDomSelection( rangy.getSelection() );
 
-       if ( !rangyRange.equals( this.rangyRange ) ){
+       if ( !rangyRange.equals( this.rangyRange ) ) {
                this.rangyRange = rangyRange;
                node = null;
                $nodeOrSlug = $( rangyRange.anchorNode ).closest( 
'.ve-ce-branchNode, .ve-ce-branchNode-slug' );
diff --git a/modules/ve/ce/ve.ce.js b/modules/ve/ce/ve.ce.js
index fcb0dbd..f557f40 100644
--- a/modules/ve/ce/ve.ce.js
+++ b/modules/ve/ce/ve.ce.js
@@ -157,10 +157,11 @@
                item = current[0][current[1]];
                if ( item.nodeType === Node.TEXT_NODE ) {
                        if ( item === domNode ) {
-                               offset += domOffset;
+                               // domOffset is a byte offset, convert it to a 
character offset
+                               offset += ve.getCharacterOffset( 
item.textContent, domOffset );
                                break;
                        } else {
-                               offset += item.textContent.length;
+                               offset += ve.getCharacterOffset( 
item.textContent, item.textContent.length );
                        }
                } else if ( item.nodeType === Node.ELEMENT_NODE ) {
                        $item = current[0].eq( current[1] );
@@ -173,9 +174,9 @@
                        } else if ( $item.hasClass( 've-ce-branchNode' ) ) {
                                offset += $item.data( 'view' ).getOuterLength();
                        } else {
-                               stack.push( [$item.contents(), 0 ] );
+                               stack.push( [ $item.contents(), 0 ] );
                                current[1]++;
-                               current = stack[stack.length-1];
+                               current = stack[ stack.length - 1 ];
                                continue;
                        }
                }
diff --git a/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js 
b/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js
index c6c83de..0f68431 100644
--- a/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js
+++ b/modules/ve/dm/lineardata/ve.dm.ElementLinearData.js
@@ -320,7 +320,11 @@
                element = this.getData( offset );
        }
 
-       return element.annotations || element[1] || [];
+       if ( typeof element === 'string' ) {
+               return [];
+       } else {
+               return element.annotations || element[1];
+       }
 };
 
 /**
diff --git a/modules/ve/dm/ve.dm.Converter.js b/modules/ve/dm/ve.dm.Converter.js
index acdf97e..6f51ae6 100644
--- a/modules/ve/dm/ve.dm.Converter.js
+++ b/modules/ve/dm/ve.dm.Converter.js
@@ -41,7 +41,7 @@
  */
 ve.dm.Converter.getDataContentFromText = function ( text, annotations ) {
        var i, len,
-               characters = text.split( '' );
+               characters = ve.splitCharacters( text );
 
        if ( !annotations || annotations.isEmpty() ) {
                return characters;
diff --git a/modules/ve/dm/ve.dm.Document.js b/modules/ve/dm/ve.dm.Document.js
index 859e9f3..8c0da12 100644
--- a/modules/ve/dm/ve.dm.Document.js
+++ b/modules/ve/dm/ve.dm.Document.js
@@ -480,12 +480,19 @@
  * @method
  * @param {Array} data Snippet of linear model data to insert
  * @param {number} offset Offset in the linear model where the caller wants to 
insert data
- * @returns {Object} A (possibly modified) copy of data and a (possibly 
modified) offset
+ * @returns {Object} A (possibly modified) copy of data, a (possibly modified) 
offset
+ * and a number of elements to remove
  */
 ve.dm.Document.prototype.fixupInsertion = function ( data, offset ) {
        var
                // Array where we build the return value
                newData = [],
+
+               // Temporary variables for handling combining marks
+               insert, annotations,
+               // An unattached combining mark may require the insertion to 
remove a character,
+               // so we send this counter back in the result
+               remove = 0,
 
                // *** Stacks ***
                // Array of element openings (object). Openings in data are 
pushed onto this stack
@@ -705,6 +712,29 @@
                                }
                        } while ( !childrenOK );
 
+                       if (
+                               i === 0 &&
+                               childType === 'text' &&
+                               ve.isUnattachedCombiningMark( data[i] )
+                       ) {
+                               // Note we only need to check data[0] as 
combining marks further
+                               // along should already have been merged
+                               if ( doc.data.isElementData( offset - 1 ) ) {
+                                       // Inserting a unattached combining 
mark is generally pretty badly
+                                       // supported (browser rendering bugs), 
so we'll just prevent it.
+                                       continue;
+                               } else {
+                                       offset--;
+                                       remove++;
+                                       insert = doc.data.getCharacterData( 
offset ) + data[i];
+                                       annotations = 
doc.data.getAnnotationIndexesFromOffset( offset );
+                                       if ( annotations.length ) {
+                                               insert = [ insert, annotations 
];
+                                       }
+                                       data[i] = insert;
+                               }
+                       }
+
                        for ( j = 0; j < closings.length; j++ ) {
                                // writeElement() would update 
openingStack/closingStack, but we've already done
                                // that for closings
@@ -762,7 +792,8 @@
 
        return {
                offset: offset,
-               data: newData
+               data: newData,
+               remove: remove
        };
 };
 
diff --git a/modules/ve/dm/ve.dm.SurfaceFragment.js 
b/modules/ve/dm/ve.dm.SurfaceFragment.js
index 96f93da..1654d1a 100644
--- a/modules/ve/dm/ve.dm.SurfaceFragment.js
+++ b/modules/ve/dm/ve.dm.SurfaceFragment.js
@@ -525,7 +525,7 @@
        }
        // Auto-convert content to array of plain text characters
        if ( typeof content === 'string' ) {
-               content = content.split( '' );
+               content = ve.splitCharacters( content );
        }
        if ( content.length ) {
                if ( annotate ) {
diff --git a/modules/ve/dm/ve.dm.Transaction.js 
b/modules/ve/dm/ve.dm.Transaction.js
index df3c542..460e899 100644
--- a/modules/ve/dm/ve.dm.Transaction.js
+++ b/modules/ve/dm/ve.dm.Transaction.js
@@ -37,7 +37,7 @@
        // Retain up to insertion point, if needed
        tx.pushRetain( insertion.offset );
        // Insert data
-       tx.pushReplace( doc, offset, 0, insertion.data );
+       tx.pushReplace( doc, insertion.offset, insertion.remove, insertion.data 
);
        // Retain to end of document, if needed (for completeness)
        tx.pushRetain( data.length - insertion.offset );
        return tx;
diff --git a/modules/ve/test/dm/ve.dm.Converter.test.js 
b/modules/ve/test/dm/ve.dm.Converter.test.js
index 675ebcb..7c8da34 100644
--- a/modules/ve/test/dm/ve.dm.Converter.test.js
+++ b/modules/ve/test/dm/ve.dm.Converter.test.js
@@ -100,7 +100,7 @@
                                store.index( cases[msg].storeItems[i].value, 
cases[msg].storeItems[i].hash );
                        }
                }
-               if( cases[msg].modify ) {
+               if ( cases[msg].modify ) {
                        cases[msg].modify( cases[msg].data );
                }
                doc = new ve.dm.Document( ve.dm.example.preprocessAnnotations( 
cases[msg].data, store ) );
diff --git a/modules/ve/ve.js b/modules/ve/ve.js
index c95da7b..f491b96 100644
--- a/modules/ve/ve.js
+++ b/modules/ve/ve.js
@@ -815,6 +815,47 @@
        };
 
        /**
+        * Split a string into individual characters, leaving multibyte 
characters as one item
+        *
+        * @param {string} text Text to split
+        * @returns {string[]} Array of characters
+        */
+       ve.splitCharacters = function ( text ) {
+               return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g ); // 
don't split UTF surrogate pairs
+       };
+
+       /**
+        * Determine if the text consists of only unattached combining marks
+        * @param {string} text Text to test
+        * @returns {boolean} The text is unattached combining marks
+        */
+       ve.isUnattachedCombiningMark = function ( text ) {
+               return ( /^[\u0300-\u036F]+$/ ).test( text );
+       };
+
+       /**
+        * Convert a character offset to a byte offset
+        *
+        * @param {string} text Text in which to calculate offset
+        * @param {number} characterOffset Character offset
+        * @returns {number} Byte offset
+        */
+       ve.getByteOffset = function ( text, characterOffset ) {
+               return ve.splitCharacters( text ).slice( 0, characterOffset 
).join( '' ).length;
+       };
+
+       /**
+        * Convert a byte offset to a character offset
+        *
+        * @param {string} text Text in which to calculate offset
+        * @param {number} byteOffset Byte offset
+        * @returns {number} Character offset
+        */
+       ve.getCharacterOffset = function ( text, byteOffset ) {
+               return ve.splitCharacters( text.substring( 0, byteOffset ) 
).length;
+       };
+
+       /**
         * Escapes non-word characters so they can be safely used as HTML 
attribute values.
         *
         * This method is basically a copy of mw.html.escape.

-- 
To view, visit https://gerrit.wikimedia.org/r/64965
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8d936fb15d82f73cd45fac142c540a7950850d55
Gerrit-PatchSet: 9
Gerrit-Project: mediawiki/extensions/VisualEditor
Gerrit-Branch: master
Gerrit-Owner: Esanders <esand...@wikimedia.org>
Gerrit-Reviewer: Catrope <roan.katt...@gmail.com>
Gerrit-Reviewer: Divec <da...@sheetmusic.org.uk>
Gerrit-Reviewer: Esanders <esand...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to