jenkins-bot has submitted this change and it was merged.

Change subject: Revert model to use simple UTF-16 code units
......................................................................
Revert model to use simple UTF-16 code units This is a prerequisite to browser-based grapheme cluster handling, which is needed so left/right cursoring and backspace behave as users expect. modules/ve/ve.js modules/ve/ce/ve.ce.Document.js modules/ve/ce/ve.ce.js * Revert cluster-aware splitting to trivial javascript code unit splitting * Rewrite ve.splitClusters as a trivial compatibility method (remove soon) * getClusterOffset/getByteOffset use unicodeJS.graphemebreak.splitClusters modules/unicodejs/tools/unicodejs-properties.py modules/unicodejs/unicodejs.graphemebreakproperties.js modules/unicodejs/unicodejs.js * Allow grapheme break tests to work with surrogate pairs demos/ve/pages/minimal.html demos/ve/pages/multibyte.html demos/ve/pages/unicode.html * replace file with more precise tests modules/ve/test/ve.test.js * Remove reference to grapheme-based splitting (which is no longer used) * Correct typo Bug: 53757 Bug: 51472 Bug: 51596 Bug: 51846 Change-Id: Ife34c87ebe40bc1689298b592eec5c0cdc2f7589 --- D demos/ve/pages/multibyte.html M modules/unicodejs/tools/unicodejs-properties.py M modules/unicodejs/unicodejs.js M modules/ve/ce/ve.ce.Document.js M modules/ve/ce/ve.ce.js M modules/ve/test/ve.test.js M modules/ve/ve.js 7 files changed, 18 insertions(+), 25 deletions(-) Approvals: Catrope: Looks good to me, approved jenkins-bot: Verified diff --git a/demos/ve/pages/multibyte.html b/demos/ve/pages/multibyte.html deleted file mode 100644 index 979b312..0000000 --- a/demos/ve/pages/multibyte.html +++ /dev/null @@ -1,6 +0,0 @@ -<p>12𨋢456789𨋢bc</p> -<p>「𨋢」字響<tt>香港</tt>衍生出好多新詞,好似:𨋢<tt>香港</tt> abc</p> -<p>abc</p> -<p>one c̀ombining accent</p> -<p>two ç̀ombining accents</p> -<p>def</p> \ No newline at end of file diff --git a/modules/unicodejs/tools/unicodejs-properties.py b/modules/unicodejs/tools/unicodejs-properties.py index f191de2..1977b74 100644 --- a/modules/unicodejs/tools/unicodejs-properties.py +++ b/modules/unicodejs/tools/unicodejs-properties.py @@ -24,7 
+24,7 @@ if not m: raise ValueError( "Bad line: %r" % line ) start, end, prop = m.groups() - if start == 'D800' and end == 'DFFF': + if breaktype == 'Grapheme' and start == 'D800' and end == 'DFFF': continue # raw surrogates are not treated if not ranges.has_key( prop ): diff --git a/modules/unicodejs/unicodejs.js b/modules/unicodejs/unicodejs.js index e513ffb..a03270d 100644 --- a/modules/unicodejs/unicodejs.js +++ b/modules/unicodejs/unicodejs.js @@ -179,7 +179,6 @@ throw new Error( 'range includes surrogates: ' + min.toString( 16 ) + '-' + max.toString( 16 ) ); } - if ( max <= 0xFFFF ) { // interval is entirely BMP characterClass.push( codeUnitRange( min, max ) ); diff --git a/modules/ve/ce/ve.ce.Document.js b/modules/ve/ce/ve.ce.Document.js index 7f2c383..86b91ea 100644 --- a/modules/ve/ce/ve.ce.Document.js +++ b/modules/ve/ce/ve.ce.Document.js @@ -148,12 +148,11 @@ } item = current[0][current[1]]; if ( item.nodeType === Node.TEXT_NODE ) { - // offset, startOffset and length are all data model lengths (not byte lengths) - length = ve.getClusterOffset( item.textContent, item.textContent.length ); + length = item.textContent.length; if ( offset >= startOffset && offset <= startOffset + length ) { return { node: item, - offset: ve.getByteOffset( item.textContent, offset - startOffset ) + offset: offset - startOffset }; } else { startOffset += length; diff --git a/modules/ve/ce/ve.ce.js b/modules/ve/ce/ve.ce.js index 894840c..5817aa1 100644 --- a/modules/ve/ce/ve.ce.js +++ b/modules/ve/ce/ve.ce.js @@ -157,11 +157,10 @@ item = current[0][current[1]]; if ( item.nodeType === Node.TEXT_NODE ) { if ( item === domNode ) { - // domOffset is a byte offset, convert it to a grapheme cluster offset - offset += ve.getClusterOffset( item.textContent, domOffset ); + offset += domOffset; break; } else { - offset += ve.getClusterOffset( item.textContent, item.textContent.length ); + offset += item.textContent.length; } } else if ( item.nodeType === Node.ELEMENT_NODE ) { $item = 
current[0].eq( current[1] ); diff --git a/modules/ve/test/ve.test.js b/modules/ve/test/ve.test.js index 8033dc3..d89461b 100644 --- a/modules/ve/test/ve.test.js +++ b/modules/ve/test/ve.test.js @@ -260,13 +260,11 @@ } } ); -// ve.splitClusters: Tested upstream (UnicodeJS) - // TODO: ve.isUnattachedCombiningMark // TODO: ve.getByteOffset -// TODO: ve.getCharacterOffset +// TODO: ve.getClusterOffset QUnit.test( 'graphemeSafeSubstring', function ( assert ) { var i, text = '12\ud860\udee245\ud860\udee2789\ud860\udee2bc', cases = [ diff --git a/modules/ve/ve.js b/modules/ve/ve.js index 3612b51..2e466e5 100644 --- a/modules/ve/ve.js +++ b/modules/ve/ve.js @@ -403,12 +403,14 @@ return ve.init.platform.getMessage.apply( ve.init.platform, arguments ); }; - /** - * @method - * @inheritdoc unicodeJS.graphemebreak#splitClusters - * @see unicodeJS.graphemebreak#splitClusters - */ - ve.splitClusters = unicodeJS.graphemebreak.splitClusters; + /** + * Compatibility method. We no longer split into clusters at this level. + * + * TODO: strip out calls to splitClusters then delete this method. + */ + ve.splitClusters = function ( text ) { + return text.split( '' ); + }; /** * Determine if the text consists of only unattached combining marks. 
@@ -428,7 +430,8 @@ * @returns {number} Byte offset */ ve.getByteOffset = function ( text, clusterOffset ) { - return ve.splitClusters( text ).slice( 0, clusterOffset ).join( '' ).length; + return unicodeJS.graphemebreak.splitClusters( text ).slice( 0, clusterOffset + ).join( '' ).length; }; /** @@ -439,7 +442,8 @@ * @returns {number} Grapheme cluster offset */ ve.getClusterOffset = function ( text, byteOffset ) { - return ve.splitClusters( text.substring( 0, byteOffset ) ).length; + return unicodeJS.graphemebreak.splitClusters( text.substring( 0, byteOffset + ) ).length; }; /** -- To view, visit https://gerrit.wikimedia.org/r/80689 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ife34c87ebe40bc1689298b592eec5c0cdc2f7589 Gerrit-PatchSet: 20 Gerrit-Project: mediawiki/extensions/VisualEditor Gerrit-Branch: master Gerrit-Owner: Divec <da...@sheetmusic.org.uk> Gerrit-Reviewer: Catrope <roan.katt...@gmail.com> Gerrit-Reviewer: Divec <da...@sheetmusic.org.uk> Gerrit-Reviewer: Esanders <esand...@wikimedia.org> Gerrit-Reviewer: Inez <i...@wikia-inc.com> Gerrit-Reviewer: Jforrester <jforres...@wikimedia.org> Gerrit-Reviewer: Krinkle <krinklem...@gmail.com> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits