jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/356582 )
Change subject: Better change/add/delete distinction for DiffOps
......................................................................
Better change/add/delete distinction for DiffOps
Improve handling of change/add/delete ops in diff engine so Change 319866 gets
a chance to run.
This uses existing diff code to compare the LHS and RHS of every
DiffOp::change, and converts it to an add+delete pair if the two sides are too
dissimilar (current threshold is 25% identical chars). This functionality is
already in Change 319866. This patch contains only the change vs add+del stuff
without moved-paragraph-detection, for easier review.
Bug: T150740
Bug: T149674
Change-Id: If0952aa53e472c23237890ee6d6997ac9d6d7c0e
---
M DiffEngine.h
M InlineDiff.cpp
M TableDiff.cpp
M Wikidiff2.cpp
M Wikidiff2.h
M config.m4
M php_wikidiff2.cpp
M tests/004.phpt
A tests/008.phpt
A textutil.h
10 files changed, 345 insertions(+), 178 deletions(-)
Approvals:
MaxSem: Looks good to me, approved
jenkins-bot: Verified
diff --git a/DiffEngine.h b/DiffEngine.h
index fc8fc5d..76d0c0b 100644
--- a/DiffEngine.h
+++ b/DiffEngine.h
@@ -13,12 +13,20 @@
#include <utility>
#include <algorithm>
#include <cassert>
+#include <string>
+#include <numeric>
#ifdef USE_JUDY
#include "JudyHS.h"
#endif
#include "Wikidiff2.h"
+#include "Word.h"
+#include "textutil.h"
+
+// helper function to calculate similarity of text lines, based on existing
diff code.
+// used in DiffEngine and Wikidiff2.
+double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector&
words2, long long bailoutComplexity, int *opCountPtr = nullptr);
/**
* Diff operation
@@ -138,6 +146,8 @@
int lcs;
bool done;
enum {MAX_CHUNKS=8};
+ void detectDissimilarChanges(PointerVector& del, PointerVector&
add, Diff<T>& diff, long long bailoutComplexity);
+ bool looksLikeChange(const T& del, const T& add, long long
bailoutComplexity);
};
//-----------------------------------------------------------------------------
@@ -155,6 +165,41 @@
seq.clear();
in_seq.clear();
done = false;
+}
+
+// for a DiffOp::change, decide whether it should be treated as a successive
add and delete based on similarity.
+template<typename T>
+inline bool DiffEngine<T>::looksLikeChange(const T& del, const T& add, long
long bailoutComplexity)
+{
+ TextUtil::WordVector words1, words2;
+ TextUtil::explodeWords(del, words1);
+ TextUtil::explodeWords(add, words2);
+ return calculateSimilarity(words1, words2, bailoutComplexity) > 0.25;
+}
+
+// go through list of changed lines. if they are too dissimilar, convert to
del+add.
+template<typename T>
+inline void DiffEngine<T>::detectDissimilarChanges(PointerVector& del,
PointerVector& add, Diff<T>& diff, long long bailoutComplexity)
+{
+ int i;
+ static PointerVector empty;
+ for (i = 0; i<del.size() && i<add.size() && !looksLikeChange(*del[i],
*add[i], bailoutComplexity); ++i) {
+ PointerVector d, a;
+ d.push_back(del[i]);
+ a.push_back(add[i]);
+ diff.add_edit(DiffOp<T>(DiffOp<T>::del, d, empty));
+ diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, a));
+ }
+ if (i) {
+ add.erase(add.begin(), add.begin()+i);
+ del.erase(del.begin(), del.begin()+i);
+ }
+}
+
+template<>
+inline void DiffEngine<Word>::detectDissimilarChanges(PointerVector& del,
PointerVector& add, Diff<Word>& diff, long long bailoutComplexity)
+{
+ // compiles to no-op in Word specialization.
}
template<typename T>
@@ -266,8 +311,18 @@
while (yi < n_to && ychanged[yi])
add.push_back(&to_lines[yi++]);
+ detectDissimilarChanges(del, add, diff, bailoutComplexity);
+
if (del.size() && add.size())
+#ifdef DIFFENGINE__EVERY_CHANGE_IS_AN_ADD_AND_DELETE
+ // for generating a worst-case benchmark of the "show moved
paragraphs" patch (gerrit change 319866)
+ {
+ diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
+ diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
+ }
+#else
diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
+#endif
else if (del.size())
diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
else if (add.size())
@@ -601,4 +656,53 @@
engine.diff(from_lines, to_lines, *this, bailoutComplexity);
}
+inline double calculateSimilarity(TextUtil::WordVector& words1,
TextUtil::WordVector& words2, long long bailoutComplexity, int *opCountPtr /* =
nullptr*/)
+{
+ typedef Diff<Word> WordDiff;
+ WordDiff diff(words1, words2, bailoutComplexity);
+ int charsTotal = 0;
+ int opCharCount[4] = { 0 };
+ double similarity;
+ auto countOpChars = [] (DiffEngine<Word>::PointerVector& p) {
+ return std::accumulate(p.begin(), p.end(), 0, [] (int a, const
Word *b) {
+ return a + (b->suffixEnd - b->bodyStart);
+ });
+ };
+ for (int i = 0; i < diff.size(); ++i) {
+ int op = diff[i].op;
+ int charCount;
+ switch (diff[i].op) {
+ case DiffOp<Word>::del:
+ case DiffOp<Word>::copy:
+ charCount = countOpChars(diff[i].from);
+ break;
+ case DiffOp<Word>::add:
+ charCount = countOpChars(diff[i].to);
+ break;
+ case DiffOp<Word>::change:
+ charCount =
std::max(countOpChars(diff[i].from), countOpChars(diff[i].to));
+ break;
+ }
+ opCharCount[op] += charCount;
+ charsTotal += charCount;
+ }
+ if (opCharCount[DiffOp<Word>::copy] == 0) {
+ similarity = 0.0;
+ } else {
+ if (charsTotal) {
+ similarity = double(opCharCount[DiffOp<Word>::copy]) /
charsTotal;
+ } else {
+ similarity = 0.0;
+ }
+ }
+
+ if (opCountPtr) {
+ for(int i = 0; i < sizeof(opCharCount)/sizeof(opCharCount[0]);
++i) {
+ opCountPtr[i] = opCharCount[i];
+ }
+ }
+
+ return similarity;
+}
+
#endif
diff --git a/InlineDiff.cpp b/InlineDiff.cpp
index d60215c..bc2ed54 100644
--- a/InlineDiff.cpp
+++ b/InlineDiff.cpp
@@ -14,8 +14,8 @@
{
WordVector words1, words2;
- explodeWords(text1, words1);
- explodeWords(text2, words2);
+ TextUtil::explodeWords(text1, words1);
+ TextUtil::explodeWords(text2, words2);
WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
String word;
diff --git a/TableDiff.cpp b/TableDiff.cpp
index 5a4969f..9c44cc2 100644
--- a/TableDiff.cpp
+++ b/TableDiff.cpp
@@ -27,8 +27,8 @@
{
WordVector words1, words2;
- explodeWords(text1, words1);
- explodeWords(text2, words2);
+ TextUtil::explodeWords(text1, words1);
+ TextUtil::explodeWords(text2, words2);
WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
//debugPrintWordDiff(worddiff);
diff --git a/Wikidiff2.cpp b/Wikidiff2.cpp
index d579b61..b5ffc05 100644
--- a/Wikidiff2.cpp
+++ b/Wikidiff2.cpp
@@ -8,9 +8,6 @@
#include <stdio.h>
#include <string.h>
#include "Wikidiff2.h"
-#include <thai/thailib.h>
-#include <thai/thwchar.h>
-#include <thai/thbrk.h>
void Wikidiff2::diffLines(const StringVector & lines1, const StringVector &
lines2,
@@ -162,144 +159,6 @@
// Append the rest of the string after the last special character
if (start < input.size()) {
result.append(input, start, input.size() - start);
- }
-}
-
-// Weak UTF-8 decoder
-// Will return garbage on invalid input (overshort sequences, overlong
sequences, etc.)
-int Wikidiff2::nextUtf8Char(String::const_iterator & p, String::const_iterator
& charStart,
- String::const_iterator end)
-{
- int c = 0;
- unsigned char byte;
- int seqLength = 0;
- charStart = p;
- if (p == end) {
- return 0;
- }
- do {
- byte = (unsigned char)*p;
- if (byte < 0x80) {
- c = byte;
- seqLength = 0;
- } else if (byte >= 0xc0) {
- // Start of UTF-8 character
- // If this is unexpected, due to an overshort sequence,
we ignore the invalid
- // sequence and resynchronise here
- if (byte < 0xe0) {
- seqLength = 1;
- c = byte & 0x1f;
- } else if (byte < 0xf0) {
- seqLength = 2;
- c = byte & 0x0f;
- } else {
- seqLength = 3;
- c = byte & 7;
- }
- } else if (seqLength) {
- c <<= 6;
- c |= byte & 0x3f;
- --seqLength;
- } else {
- // Unexpected continuation, ignore
- }
- ++p;
- } while (seqLength && p != end);
- return c;
-}
-
-// Split a string into words
-//
-// TODO: I think the best way to do this would be to use ICU BreakIterator
-// instead of libthai + DIY. Basically you'd run BreakIterators from several
-// different locales (en, th, ja) and merge the results, i.e. if a break occurs
-// in any locale at a given position, split the string. I don't know if the
-// quality of the Thai dictionary in ICU matches the one in libthai, we would
-// have to check this somehow.
-void Wikidiff2::explodeWords(const String & text, WordVector &words)
-{
- // Decode the UTF-8 in the string.
- // * Save the character sizes (in bytes)
- // * Convert the string to TIS-620, which is the internal character set
of libthai.
- // * Save the character offsets of any break positions (same format as
libthai).
-
- String tisText, charSizes;
- String::const_iterator suffixEnd, charStart, p;
- IntSet breaks;
-
- tisText.reserve(text.size());
- charSizes.reserve(text.size());
- wchar_t ch, lastChar;
- thchar_t thaiChar;
- bool hasThaiChars = false;
-
- p = text.begin();
- ch = nextUtf8Char(p, charStart, text.end());
- lastChar = 0;
- int charIndex = 0;
- while (ch) {
- thaiChar = th_uni2tis(ch);
- if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
- hasThaiChars = true;
- }
- tisText += (char)thaiChar;
- charSizes += (char)(p - charStart);
-
- if (isLetter(ch)) {
- if (lastChar && !isLetter(lastChar)) {
- breaks.insert(charIndex);
- }
- } else {
- breaks.insert(charIndex);
- }
- charIndex++;
- lastChar = ch;
- ch = nextUtf8Char(p, charStart, text.end());
- }
-
- // If there were any Thai characters in the string, run th_brk on it
and add
- // the resulting break positions
- if (hasThaiChars) {
- IntVector thaiBreakPositions;
- tisText += '\0';
- thaiBreakPositions.resize(tisText.size());
- int numBreaks = th_brk((const thchar_t*)(tisText.data()),
- &thaiBreakPositions[0],
thaiBreakPositions.size());
- thaiBreakPositions.resize(numBreaks);
- breaks.insert(thaiBreakPositions.begin(),
thaiBreakPositions.end());
- }
-
- // Add a fake end-of-string character and have a break on it, so that
the
- // last word gets added without special handling
- breaks.insert(charSizes.size());
- charSizes += (char)0;
-
- // Now make the word array by traversing the breaks set
- p = text.begin();
- IntSet::iterator pBrk = breaks.begin();
- String::const_iterator wordStart = text.begin();
- String::const_iterator suffixStart = text.end();
-
- // If there's a break at the start of the string, skip it
- if (pBrk != breaks.end() && *pBrk == 0) {
- pBrk++;
- }
-
- for (charIndex = 0; charIndex < charSizes.size(); p +=
charSizes[charIndex++]) {
- // Assume all spaces are ASCII
- if (isSpace(*p)) {
- suffixStart = p;
- }
- if (pBrk != breaks.end() && charIndex == *pBrk) {
- if (suffixStart == text.end()) {
- words.push_back(Word(wordStart, p, p));
- } else {
- words.push_back(Word(wordStart, suffixStart,
p));
- }
- pBrk++;
- suffixStart = text.end();
- wordStart = p;
- }
}
}
diff --git a/Wikidiff2.h b/Wikidiff2.h
index da75e7b..8111043 100644
--- a/Wikidiff2.h
+++ b/Wikidiff2.h
@@ -43,40 +43,10 @@
virtual void printContext(const String & input) = 0;
void printText(const String & input);
- inline bool isLetter(int ch);
- inline bool isSpace(int ch);
void debugPrintWordDiff(WordDiff & worddiff);
- int nextUtf8Char(String::const_iterator & p,
String::const_iterator & charStart,
- String::const_iterator end);
-
- void explodeWords(const String & text, WordVector &tokens);
void explodeLines(const String & text, StringVector &lines);
};
-
-inline bool Wikidiff2::isLetter(int ch)
-{
- // Standard alphanumeric
- if ((ch >= '0' && ch <= '9') ||
- (ch == '_') ||
- (ch >= 'A' && ch <= 'Z') ||
- (ch >= 'a' && ch <= 'z'))
- {
- return true;
- }
- // Punctuation and control characters
- if (ch < 0xc0) return false;
- // Chinese, Japanese: split up character by character
- if (ch >= 0x3000 && ch <= 0x9fff) return false;
- if (ch >= 0x20000 && ch <= 0x2a000) return false;
- // Otherwise assume it's from a language that uses spaces
- return true;
-}
-
-inline bool Wikidiff2::isSpace(int ch)
-{
- return ch == ' ' || ch == '\t';
-}
inline const Wikidiff2::String & Wikidiff2::getResult() const
{
diff --git a/config.m4 b/config.m4
index b848398..c9e2197 100644
--- a/config.m4
+++ b/config.m4
@@ -36,6 +36,6 @@
PHP_SUBST(WIKIDIFF2_SHARED_LIBADD)
AC_DEFINE(HAVE_WIKIDIFF2, 1, [ ])
- export CXXFLAGS="-Wno-write-strings $CXXFLAGS"
+ export CXXFLAGS="-Wno-write-strings -std=c++11 $CXXFLAGS"
PHP_NEW_EXTENSION(wikidiff2, php_wikidiff2.cpp Wikidiff2.cpp TableDiff.cpp
InlineDiff.cpp, $ext_shared)
fi
diff --git a/php_wikidiff2.cpp b/php_wikidiff2.cpp
index e15ad2b..6f77fc1 100644
--- a/php_wikidiff2.cpp
+++ b/php_wikidiff2.cpp
@@ -44,7 +44,6 @@
STANDARD_MODULE_PROPERTIES
};
-
#ifdef COMPILE_DL_WIKIDIFF2
ZEND_GET_MODULE(wikidiff2)
#endif
diff --git a/tests/004.phpt b/tests/004.phpt
index 32594d4..3dc0fa5 100644
--- a/tests/004.phpt
+++ b/tests/004.phpt
@@ -32,7 +32,8 @@
<div class="mw-diff-inline-changed">foo <del>bar</del><ins>test</ins></div>
<div class="mw-diff-inline-deleted"><del> </del></div>
<div class="mw-diff-inline-context">baz</div>
-<div class="mw-diff-inline-changed"><del>quux</del><ins>test</ins></div>
+<div class="mw-diff-inline-deleted"><del>quux</del></div>
+<div class="mw-diff-inline-added"><ins>test</ins></div>
<div class="mw-diff-inline-added"><ins> </ins></div>
<div class="mw-diff-inline-context">bang</div>
diff --git a/tests/008.phpt b/tests/008.phpt
new file mode 100644
index 0000000..92499c5
--- /dev/null
+++ b/tests/008.phpt
@@ -0,0 +1,54 @@
+--TEST--
+Test detection of dissimilar paragraphs
+--SKIPIF--
+<?php if (!extension_loaded("wikidiff2")) print "skip"; ?>
+--FILE--
+<?php
+$x = <<<EOT
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+EOT;
+
+#---------------------------------------------------
+
+$y = <<<EOT
+AAAAA AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+EOT;
+
+#---------------------------------------------------
+
+print wikidiff2_do_diff( $x, $y, 2 );
+
+?>
+--EXPECT--
+<tr>
+ <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+ <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+</tr>
+<tr>
+ <td class="diff-marker">−</td>
+ <td class="diff-deletedline"><div>AAAAA AAAAA <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del> <del class="diffchange
diffchange-inline">AAAAA</del></div></td>
+ <td class="diff-marker">+</td>
+ <td class="diff-addedline"><div>AAAAA AAAAA <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins> <ins class="diffchange
diffchange-inline">BBBBB</ins></div></td>
+</tr>
+<tr>
+ <td class="diff-marker"> </td>
+ <td class="diff-context"></td>
+ <td class="diff-marker"> </td>
+ <td class="diff-context"></td>
+</tr>
+<tr>
+ <td class="diff-marker">−</td>
+ <td class="diff-deletedline"><div>AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
AAAAA AAAAA AAAAA</div></td>
+ <td colspan="2" class="diff-empty"> </td>
+</tr>
+<tr>
+ <td colspan="2" class="diff-empty"> </td>
+ <td class="diff-marker">+</td>
+ <td class="diff-addedline"><div>AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
BBBBB BBBBB BBBBB</div></td>
+</tr>
\ No newline at end of file
diff --git a/textutil.h b/textutil.h
new file mode 100644
index 0000000..6031380
--- /dev/null
+++ b/textutil.h
@@ -0,0 +1,180 @@
+#ifndef TEXTUTIL_H
+#define TEXTUTIL_H
+
+#include <thai/thailib.h>
+#include <thai/thwchar.h>
+#include <thai/thbrk.h>
+
+namespace TextUtil
+{
+ typedef std::basic_string<char, std::char_traits<char>,
WD2_ALLOCATOR<char> > String;
+ typedef std::vector<Word, WD2_ALLOCATOR<Word> > WordVector;
+ typedef std::set<int, std::less<int>, WD2_ALLOCATOR<int> > IntSet;
+ typedef std::vector<int, WD2_ALLOCATOR<int> > IntVector;
+
+ // helper functions used in both DiffEngine and Wikidiff2
+
+ inline bool isLetter(int ch)
+ {
+ // Standard alphanumeric
+ if ((ch >= '0' && ch <= '9') ||
+ (ch == '_') ||
+ (ch >= 'A' && ch <= 'Z') ||
+ (ch >= 'a' && ch <= 'z'))
+ {
+ return true;
+ }
+ // Punctuation and control characters
+ if (ch < 0xc0) return false;
+ // Chinese, Japanese: split up character by character
+ if (ch >= 0x3000 && ch <= 0x9fff) return false;
+ if (ch >= 0x20000 && ch <= 0x2a000) return false;
+ // Otherwise assume it's from a language that uses spaces
+ return true;
+ }
+
+ inline bool isSpace(int ch)
+ {
+ return ch == ' ' || ch == '\t';
+ }
+
+ // Weak UTF-8 decoder
+ // Will return garbage on invalid input (overshort sequences, overlong
sequences, etc.)
+ inline int nextUtf8Char(String::const_iterator & p,
String::const_iterator & charStart,
+ String::const_iterator end)
+ {
+ int c = 0;
+ unsigned char byte;
+ int seqLength = 0;
+ charStart = p;
+ if (p == end) {
+ return 0;
+ }
+ do {
+ byte = (unsigned char)*p;
+ if (byte < 0x80) {
+ c = byte;
+ seqLength = 0;
+ } else if (byte >= 0xc0) {
+ // Start of UTF-8 character
+ // If this is unexpected, due to an overshort
sequence, we ignore the invalid
+ // sequence and resynchronise here
+ if (byte < 0xe0) {
+ seqLength = 1;
+ c = byte & 0x1f;
+ } else if (byte < 0xf0) {
+ seqLength = 2;
+ c = byte & 0x0f;
+ } else {
+ seqLength = 3;
+ c = byte & 7;
+ }
+ } else if (seqLength) {
+ c <<= 6;
+ c |= byte & 0x3f;
+ --seqLength;
+ } else {
+ // Unexpected continuation, ignore
+ }
+ ++p;
+ } while (seqLength && p != end);
+ return c;
+ }
+
+ // Split a string into words
+ //
+ // TODO: I think the best way to do this would be to use ICU
BreakIterator
+ // instead of libthai + DIY. Basically you'd run BreakIterators from
several
+ // different locales (en, th, ja) and merge the results, i.e. if a
break occurs
+ // in any locale at a given position, split the string. I don't know if
the
+ // quality of the Thai dictionary in ICU matches the one in libthai, we
would
+ // have to check this somehow.
+ inline void explodeWords(const String & text, WordVector &words)
+ {
+ // Decode the UTF-8 in the string.
+ // * Save the character sizes (in bytes)
+ // * Convert the string to TIS-620, which is the internal
character set of libthai.
+ // * Save the character offsets of any break positions (same
format as libthai).
+
+ String tisText, charSizes;
+ String::const_iterator suffixEnd, charStart, p;
+ IntSet breaks;
+
+ tisText.reserve(text.size());
+ charSizes.reserve(text.size());
+ wchar_t ch, lastChar;
+ thchar_t thaiChar;
+ bool hasThaiChars = false;
+
+ p = text.begin();
+ ch = nextUtf8Char(p, charStart, text.end());
+ lastChar = 0;
+ int charIndex = 0;
+ while (ch) {
+ thaiChar = th_uni2tis(ch);
+ if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
+ hasThaiChars = true;
+ }
+ tisText += (char)thaiChar;
+ charSizes += (char)(p - charStart);
+
+ if (isLetter(ch)) {
+ if (lastChar && !isLetter(lastChar)) {
+ breaks.insert(charIndex);
+ }
+ } else {
+ breaks.insert(charIndex);
+ }
+ charIndex++;
+ lastChar = ch;
+ ch = nextUtf8Char(p, charStart, text.end());
+ }
+
+ // If there were any Thai characters in the string, run th_brk
on it and add
+ // the resulting break positions
+ if (hasThaiChars) {
+ IntVector thaiBreakPositions;
+ tisText += '\0';
+ thaiBreakPositions.resize(tisText.size());
+ int numBreaks = th_brk((const
thchar_t*)(tisText.data()),
+ &thaiBreakPositions[0],
thaiBreakPositions.size());
+ thaiBreakPositions.resize(numBreaks);
+ breaks.insert(thaiBreakPositions.begin(),
thaiBreakPositions.end());
+ }
+
+ // Add a fake end-of-string character and have a break on it,
so that the
+ // last word gets added without special handling
+ breaks.insert(charSizes.size());
+ charSizes += (char)0;
+
+ // Now make the word array by traversing the breaks set
+ p = text.begin();
+ IntSet::iterator pBrk = breaks.begin();
+ String::const_iterator wordStart = text.begin();
+ String::const_iterator suffixStart = text.end();
+
+ // If there's a break at the start of the string, skip it
+ if (pBrk != breaks.end() && *pBrk == 0) {
+ pBrk++;
+ }
+
+ for (charIndex = 0; charIndex < charSizes.size(); p +=
charSizes[charIndex++]) {
+ // Assume all spaces are ASCII
+ if (isSpace(*p)) {
+ suffixStart = p;
+ }
+ if (pBrk != breaks.end() && charIndex == *pBrk) {
+ if (suffixStart == text.end()) {
+ words.push_back(Word(wordStart, p, p));
+ } else {
+ words.push_back(Word(wordStart,
suffixStart, p));
+ }
+ pBrk++;
+ suffixStart = text.end();
+ wordStart = p;
+ }
+ }
+ }
+}
+
+#endif // TEXTUTIL_H
--
To view, visit https://gerrit.wikimedia.org/r/356582
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If0952aa53e472c23237890ee6d6997ac9d6d7c0e
Gerrit-PatchSet: 24
Gerrit-Project: mediawiki/php/wikidiff2
Gerrit-Branch: master
Gerrit-Owner: Jkroll <[email protected]>
Gerrit-Reviewer: Awight <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Jkroll <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: Tim Starling <[email protected]>
Gerrit-Reviewer: Tobias Gritschacher <[email protected]>
Gerrit-Reviewer: WMDE-Fisch <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits