[MediaWiki-commits] [Gerrit] [BREAKING CHANGE] Use Unicode character class to calculate w... - change (VisualEditor/VisualEditor)

jenkins-bot (Code Review) Wed, 18 Mar 2015 18:17:24 -0700

jenkins-bot has submitted this change and it was merged.

Change subject: [BREAKING CHANGE] Use Unicode character class to calculate word expansions
......................................................................



[BREAKING CHANGE] Use Unicode character class to calculate word expansions

ve.dm.ElementLinearData
* BREAKING CHANGE: Rename getNearestWordRange to getWordRange
* Consider character class when determining its expansions
* Add tests for single character expansion

Bug: T78202
Change-Id: I226566eacd18eff67eeab5f67088c6eb42c7cdc8
---
M src/dm/lineardata/ve.dm.ElementLinearData.js
M src/dm/ve.dm.SurfaceFragment.js
M tests/dm/lineardata/ve.dm.ElementLinearData.test.js
3 files changed, 101 insertions(+), 29 deletions(-)

Approvals:
  Catrope: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/src/dm/lineardata/ve.dm.ElementLinearData.js 
b/src/dm/lineardata/ve.dm.ElementLinearData.js
index 2db7b19..a369086 100644
--- a/src/dm/lineardata/ve.dm.ElementLinearData.js
+++ b/src/dm/lineardata/ve.dm.ElementLinearData.js
@@ -24,6 +24,16 @@
 
 OO.inheritClass( ve.dm.ElementLinearData, ve.dm.FlatLinearData );
 
+/* Static Members */
+
+ve.dm.ElementLinearData.static.startWordRegExp = new RegExp(
+       '^(' + unicodeJS.characterclass.patterns.word + ')'
+);
+
+ve.dm.ElementLinearData.static.endWordRegExp = new RegExp(
+       '(' + unicodeJS.characterclass.patterns.word + ')$'
+);
+
 /* Static Methods */
 
 /**
@@ -765,38 +775,58 @@
 };
 
 /**
- * Get the nearest word boundaries as a range.
+ * Get the range of the word at offset (else a collapsed range)
  *
- * The offset will first be moved to the nearest content offset if it's not at one already.
- * Elements are always word boundaries.
+ * First, if the offset is not a content offset then it will be moved to the nearest one.
+ * Then, if the offset is inside a word, it will be expanded to that word;
+ * else if the offset is at the end of a word, it will be expanded to that word;
+ * else if the offset is at the start of a word, it will be expanded to that word;
+ * else the offset is not adjacent to any word and is returned as a collapsed range.
  *
  * @method
- * @param {number} offset Offset to start from
- * @returns {ve.Range} Range around nearest word boundaries
+ * @param {number} offset Offset to start from; must not be inside a surrogate pair
+ * @returns {ve.Range} Boundaries of the adjacent word (else offset as collapsed range)
  */
-ve.dm.ElementLinearData.prototype.getNearestWordRange = function ( offset ) {
-       var offsetLeft, offsetRight,
-               dataString = new ve.dm.DataString( this.getData() );
+ve.dm.ElementLinearData.prototype.getWordRange = function ( offset ) {
+       var dataString = new ve.dm.DataString( this.getData() );
 
        offset = this.getNearestContentOffset( offset );
 
-       // If the cursor offset is a break (i.e. the start/end of word) we 
should
-       // check one position either side to see if there is a non-break
-       // and if so, move the offset accordingly
        if ( unicodeJS.wordbreak.isBreak( dataString, offset ) ) {
-               if ( !unicodeJS.wordbreak.isBreak( dataString, offset + 1 ) ) {
-                       offset++;
-               } else if ( !unicodeJS.wordbreak.isBreak( dataString, offset - 
1 ) ) {
-                       offset--;
+               // The cursor offset is not inside a word. See if there is an adjacent word
+               // codepoint (checking two chars to allow surrogate pairs). If so, expand in that
+               // direction only (preferring backwards if there are word codepoints on both
+               // sides).
+
+               if ( this.constructor.static.endWordRegExp.exec(
+                       ( dataString.read( offset - 2 ) || ' ' ) +
+                       ( dataString.read( offset - 1 ) || ' ' )
+               ) ) {
+                       // Cursor is immediately after a word codepoint: expand 
backwards
+                       return new ve.Range(
+                               unicodeJS.wordbreak.prevBreakOffset( 
dataString, offset ),
+                               offset
+                       );
+               } else if ( this.constructor.static.startWordRegExp.exec(
+                       ( dataString.read( offset ) || ' ' ) +
+                       ( dataString.read( offset + 1 ) || ' ' )
+               ) ) {
+                       // Cursor is immediately before a word codepoint: 
expand forwards
+                       return new ve.Range(
+                               offset,
+                               unicodeJS.wordbreak.nextBreakOffset( 
dataString, offset )
+                       );
                } else {
+                       // Cursor is not adjacent to a word codepoint: do not 
expand
                        return new ve.Range( offset );
                }
+       } else {
+               // Cursor is inside a word: expand both backwards and forwards
+               return new ve.Range(
+                       unicodeJS.wordbreak.prevBreakOffset( dataString, offset 
),
+                       unicodeJS.wordbreak.nextBreakOffset( dataString, offset 
)
+               );
        }
-
-       offsetRight = unicodeJS.wordbreak.nextBreakOffset( dataString, offset );
-       offsetLeft = unicodeJS.wordbreak.prevBreakOffset( dataString, offset );
-
-       return new ve.Range( offsetLeft, offsetRight );
 };
 
 /**
diff --git a/src/dm/ve.dm.SurfaceFragment.js b/src/dm/ve.dm.SurfaceFragment.js
index 22d282a..9e862c4 100644
--- a/src/dm/ve.dm.SurfaceFragment.js
+++ b/src/dm/ve.dm.SurfaceFragment.js
@@ -337,12 +337,12 @@
                case 'word':
                        if ( !oldRange.isCollapsed() ) {
                                newRange = ve.Range.static.newCoveringRange( [
-                                       this.document.data.getNearestWordRange( 
oldRange.start ),
-                                       this.document.data.getNearestWordRange( 
oldRange.end )
+                                       this.document.data.getWordRange( 
oldRange.start ),
+                                       this.document.data.getWordRange( 
oldRange.end )
                                ], oldRange.isBackwards() );
                        } else {
                                // optimisation for zero-length ranges
-                               newRange = 
this.document.data.getNearestWordRange( oldRange.start );
+                               newRange = this.document.data.getWordRange( 
oldRange.start );
                        }
                        break;
                case 'annotation':
diff --git a/tests/dm/lineardata/ve.dm.ElementLinearData.test.js 
b/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
index ef9ba7e..fcc1df1 100644
--- a/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
+++ b/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
@@ -1230,8 +1230,8 @@
        }
 } );
 
-QUnit.test( 'getNearestWordRange', function ( assert ) {
-       var i, data, range, word,
+QUnit.test( 'getWordRange', function ( assert ) {
+       var i, data, elementLinearData, range, word,
                store = new ve.dm.IndexValueStore(),
                cases = [
                        {
@@ -1340,12 +1340,48 @@
                                phrase: '维基百科',
                                msg: 'Hanzi characters (cursor in middle)',
                                offset: 2,
-                               expected: ''
+                               expected: '基'
                        },
                        {
                                phrase: '维基百科',
                                msg: 'Hanzi characters (cursor at end)',
                                offset: 4,
+                               expected: '科'
+                       },
+                       {
+                               phrase: 'a b',
+                               msg: 'Single-char word before cursor',
+                               offset: 1,
+                               expected: 'a'
+                       },
+                       {
+                               phrase: 'a b',
+                               msg: 'Single-char word after cursor',
+                               offset: 2,
+                               expected: 'b'
+                       },
+                       {
+                               phrase: '佢地嘅𨋢壞咗',
+                               msg: 'Surrogate-pair word character before 
cursor',
+                               offset: 5,
+                               expected: '𨋢'
+                       },
+                       {
+                               phrase: '"𨋢"=lip1',
+                               msg: 'Surrogate-pair word character after 
cursor',
+                               offset: 1,
+                               expected: '𨋢'
+                       },
+                       {
+                               phrase: '"\uD83D\uDE00"=GRINNING_FACE',
+                               msg: 'Surrogate-pair non-word character before 
cursor',
+                               offset: 3,
+                               expected: ''
+                       },
+                       {
+                               phrase: '"\uD83D\uDE00"=GRINNING_FACE',
+                               msg: 'Surrogate-pair non-word character after 
cursor',
+                               offset: 1,
                                expected: ''
                        },
                        {
@@ -1363,9 +1399,15 @@
                ];
        QUnit.expect( cases.length );
        for ( i = 0; i < cases.length; i++ ) {
-               data = new ve.dm.ElementLinearData( store, 
cases[i].phrase.split( '' ) );
-               range = data.getNearestWordRange( cases[i].offset );
-               word = cases[i].phrase.substring( range.start, range.end );
+               // Construct the text (inside a paragraph, because getNearestContentOffset assumes
+               // text cannot be at the very start or end of the data).
+               data = cases[i].phrase.split( '' );
+               data.unshift( { type: 'paragraph' } );
+               data.push( { type: '/paragraph' } );
+               elementLinearData = new ve.dm.ElementLinearData( store, data );
+               // Adjust offsets to account for the paragraph tag
+               range = elementLinearData.getWordRange( cases[i].offset + 1 );
+               word = cases[i].phrase.substring( range.start - 1, range.end - 
1 );
                assert.strictEqual( word, cases[i].expected,
                        cases[i].msg + ': ' +
                        cases[i].phrase.substring( 0, cases[i].offset ) + '│' +

-- 
To view, visit https://gerrit.wikimedia.org/r/197661
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I226566eacd18eff67eeab5f67088c6eb42c7cdc8
Gerrit-PatchSet: 6
Gerrit-Project: VisualEditor/VisualEditor
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Jforrester <[email protected]>
Gerrit-Reviewer: Mooeypoo <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] [BREAKING CHANGE] Use Unicode character class to calculate w... - change (VisualEditor/VisualEditor)

Reply via email to