jenkins-bot has submitted this change and it was merged.
Change subject: [BREAKING CHANGE] Use Unicode character class to calculate word expansions
......................................................................
[BREAKING CHANGE] Use Unicode character class to calculate word expansions
ve.dm.ElementLinearData
* BREAKING CHANGE: Rename getNearestWordRange to getWordRange
* Consider character class when determining its expansions
* Add tests for single character expansion
Bug: T78202
Change-Id: I226566eacd18eff67eeab5f67088c6eb42c7cdc8
---
M src/dm/lineardata/ve.dm.ElementLinearData.js
M src/dm/ve.dm.SurfaceFragment.js
M tests/dm/lineardata/ve.dm.ElementLinearData.test.js
3 files changed, 101 insertions(+), 29 deletions(-)
Approvals:
Catrope: Looks good to me, approved
jenkins-bot: Verified
diff --git a/src/dm/lineardata/ve.dm.ElementLinearData.js b/src/dm/lineardata/ve.dm.ElementLinearData.js
index 2db7b19..a369086 100644
--- a/src/dm/lineardata/ve.dm.ElementLinearData.js
+++ b/src/dm/lineardata/ve.dm.ElementLinearData.js
@@ -24,6 +24,16 @@
OO.inheritClass( ve.dm.ElementLinearData, ve.dm.FlatLinearData );
+/* Static Members */
+
+ve.dm.ElementLinearData.static.startWordRegExp = new RegExp(
+ '^(' + unicodeJS.characterclass.patterns.word + ')'
+);
+
+ve.dm.ElementLinearData.static.endWordRegExp = new RegExp(
+ '(' + unicodeJS.characterclass.patterns.word + ')$'
+);
+
/* Static Methods */
/**
@@ -765,38 +775,58 @@
};
/**
- * Get the nearest word boundaries as a range.
+ * Get the range of the word at offset (else a collapsed range)
*
- * The offset will first be moved to the nearest content offset if it's not at one already.
- * Elements are always word boundaries.
+ * First, if the offset is not a content offset then it will be moved to the nearest one.
+ * Then, if the offset is inside a word, it will be expanded to that word;
+ * else if the offset is at the end of a word, it will be expanded to that word;
+ * else if the offset is at the start of a word, it will be expanded to that word;
+ * else the offset is not adjacent to any word and is returned as a collapsed range.
*
* @method
- * @param {number} offset Offset to start from
- * @returns {ve.Range} Range around nearest word boundaries
+ * @param {number} offset Offset to start from; must not be inside a surrogate pair
+ * @returns {ve.Range} Boundaries of the adjacent word (else offset as collapsed range)
*/
-ve.dm.ElementLinearData.prototype.getNearestWordRange = function ( offset ) {
- var offsetLeft, offsetRight,
- dataString = new ve.dm.DataString( this.getData() );
+ve.dm.ElementLinearData.prototype.getWordRange = function ( offset ) {
+ var dataString = new ve.dm.DataString( this.getData() );
offset = this.getNearestContentOffset( offset );
- // If the cursor offset is a break (i.e. the start/end of word) we should
- // check one position either side to see if there is a non-break
- // and if so, move the offset accordingly
if ( unicodeJS.wordbreak.isBreak( dataString, offset ) ) {
- if ( !unicodeJS.wordbreak.isBreak( dataString, offset + 1 ) ) {
- offset++;
- } else if ( !unicodeJS.wordbreak.isBreak( dataString, offset -
1 ) ) {
- offset--;
+ // The cursor offset is not inside a word. See if there is an adjacent word
+ // codepoint (checking two chars to allow surrogate pairs). If so, expand in that
+ // direction only (preferring backwards if there are word codepoints on both
+ // sides).
+
+ if ( this.constructor.static.endWordRegExp.exec(
+ ( dataString.read( offset - 2 ) || ' ' ) +
+ ( dataString.read( offset - 1 ) || ' ' )
+ ) ) {
+ // Cursor is immediately after a word codepoint: expand backwards
+ return new ve.Range(
+ unicodeJS.wordbreak.prevBreakOffset(
dataString, offset ),
+ offset
+ );
+ } else if ( this.constructor.static.startWordRegExp.exec(
+ ( dataString.read( offset ) || ' ' ) +
+ ( dataString.read( offset + 1 ) || ' ' )
+ ) ) {
+ // Cursor is immediately before a word codepoint: expand forwards
+ return new ve.Range(
+ offset,
+ unicodeJS.wordbreak.nextBreakOffset(
dataString, offset )
+ );
} else {
+ // Cursor is not adjacent to a word codepoint: do not expand
return new ve.Range( offset );
}
+ } else {
+ // Cursor is inside a word: expand both backwards and forwards
+ return new ve.Range(
+ unicodeJS.wordbreak.prevBreakOffset( dataString, offset
),
+ unicodeJS.wordbreak.nextBreakOffset( dataString, offset
)
+ );
}
-
- offsetRight = unicodeJS.wordbreak.nextBreakOffset( dataString, offset );
- offsetLeft = unicodeJS.wordbreak.prevBreakOffset( dataString, offset );
-
- return new ve.Range( offsetLeft, offsetRight );
};
/**
diff --git a/src/dm/ve.dm.SurfaceFragment.js b/src/dm/ve.dm.SurfaceFragment.js
index 22d282a..9e862c4 100644
--- a/src/dm/ve.dm.SurfaceFragment.js
+++ b/src/dm/ve.dm.SurfaceFragment.js
@@ -337,12 +337,12 @@
case 'word':
if ( !oldRange.isCollapsed() ) {
newRange = ve.Range.static.newCoveringRange( [
- this.document.data.getNearestWordRange(
oldRange.start ),
- this.document.data.getNearestWordRange(
oldRange.end )
+ this.document.data.getWordRange(
oldRange.start ),
+ this.document.data.getWordRange(
oldRange.end )
], oldRange.isBackwards() );
} else {
// optimisation for zero-length ranges
- newRange =
this.document.data.getNearestWordRange( oldRange.start );
+ newRange = this.document.data.getWordRange(
oldRange.start );
}
break;
case 'annotation':
diff --git a/tests/dm/lineardata/ve.dm.ElementLinearData.test.js b/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
index ef9ba7e..fcc1df1 100644
--- a/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
+++ b/tests/dm/lineardata/ve.dm.ElementLinearData.test.js
@@ -1230,8 +1230,8 @@
}
} );
-QUnit.test( 'getNearestWordRange', function ( assert ) {
- var i, data, range, word,
+QUnit.test( 'getWordRange', function ( assert ) {
+ var i, data, elementLinearData, range, word,
store = new ve.dm.IndexValueStore(),
cases = [
{
@@ -1340,12 +1340,48 @@
phrase: '维基百科',
msg: 'Hanzi characters (cursor in middle)',
offset: 2,
- expected: ''
+ expected: '基'
},
{
phrase: '维基百科',
msg: 'Hanzi characters (cursor at end)',
offset: 4,
+ expected: '科'
+ },
+ {
+ phrase: 'a b',
+ msg: 'Single-char word before cursor',
+ offset: 1,
+ expected: 'a'
+ },
+ {
+ phrase: 'a b',
+ msg: 'Single-char word after cursor',
+ offset: 2,
+ expected: 'b'
+ },
+ {
+ phrase: '佢地嘅𨋢壞咗',
+ msg: 'Surrogate-pair word character before
cursor',
+ offset: 5,
+ expected: '𨋢'
+ },
+ {
+ phrase: '"𨋢"=lip1',
+ msg: 'Surrogate-pair word character after
cursor',
+ offset: 1,
+ expected: '𨋢'
+ },
+ {
+ phrase: '"\uD83D\uDE00"=GRINNING_FACE',
+ msg: 'Surrogate-pair non-word character before
cursor',
+ offset: 3,
+ expected: ''
+ },
+ {
+ phrase: '"\uD83D\uDE00"=GRINNING_FACE',
+ msg: 'Surrogate-pair non-word character after
cursor',
+ offset: 1,
expected: ''
},
{
@@ -1363,9 +1399,15 @@
];
QUnit.expect( cases.length );
for ( i = 0; i < cases.length; i++ ) {
- data = new ve.dm.ElementLinearData( store,
cases[i].phrase.split( '' ) );
- range = data.getNearestWordRange( cases[i].offset );
- word = cases[i].phrase.substring( range.start, range.end );
+ // Construct the text (inside a paragraph, because getNearestContentOffset assumes
+ // text cannot be at the very start or end of the data).
+ data = cases[i].phrase.split( '' );
+ data.unshift( { type: 'paragraph' } );
+ data.push( { type: '/paragraph' } );
+ elementLinearData = new ve.dm.ElementLinearData( store, data );
+ // Adjust offsets to account for the paragraph tag
+ range = elementLinearData.getWordRange( cases[i].offset + 1 );
+ word = cases[i].phrase.substring( range.start - 1, range.end -
1 );
assert.strictEqual( word, cases[i].expected,
cases[i].msg + ': ' +
cases[i].phrase.substring( 0, cases[i].offset ) + '│' +
--
To view, visit https://gerrit.wikimedia.org/r/197661
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I226566eacd18eff67eeab5f67088c6eb42c7cdc8
Gerrit-PatchSet: 6
Gerrit-Project: VisualEditor/VisualEditor
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Jforrester <[email protected]>
Gerrit-Reviewer: Mooeypoo <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits