Better change/add/delete distinction for DiffOps

jenkins-bot (Code Review) Tue, 05 Sep 2017 15:49:00 -0700

jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/356582 )


Change subject: Better change/add/delete distinction for DiffOps
......................................................................


Better change/add/delete distinction for DiffOps

Improve handling of change/add/delete ops in the diff engine so that the 
moved-paragraph detection in Change 319866 gets a chance to run.

This uses the existing diff code to compare the LHS and RHS of every 
DiffOp::change, and converts it to an add+delete pair if they are too 
dissimilar (the current threshold is 25% identical characters). This 
functionality is already part of Change 319866. This patch contains only the 
change-vs-add+delete logic, without the moved-paragraph detection, for easier 
review.

Bug: T150740
Bug: T149674
Change-Id: If0952aa53e472c23237890ee6d6997ac9d6d7c0e
---
M DiffEngine.h
M InlineDiff.cpp
M TableDiff.cpp
M Wikidiff2.cpp
M Wikidiff2.h
M config.m4
M php_wikidiff2.cpp
M tests/004.phpt
A tests/008.phpt
A textutil.h
10 files changed, 345 insertions(+), 178 deletions(-)

Approvals:
  MaxSem: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/DiffEngine.h b/DiffEngine.h
index fc8fc5d..76d0c0b 100644
--- a/DiffEngine.h
+++ b/DiffEngine.h
@@ -13,12 +13,20 @@
 #include <utility>
 #include <algorithm>
 #include <cassert>
+#include <string>
+#include <numeric>
 
 #ifdef USE_JUDY
 #include "JudyHS.h"
 #endif
 
 #include "Wikidiff2.h"
+#include "Word.h"
+#include "textutil.h"
+
+// helper function to calculate similarity of text lines, based on existing 
diff code.
+// used in DiffEngine and Wikidiff2.
+double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector& 
words2, long long bailoutComplexity, int *opCountPtr = nullptr);
 
 /**
  * Diff operation
@@ -138,6 +146,8 @@
                int lcs;
                bool done;
                enum {MAX_CHUNKS=8};
+               void detectDissimilarChanges(PointerVector& del, PointerVector& 
add, Diff<T>& diff, long long bailoutComplexity);
+               bool looksLikeChange(const T& del, const T& add, long long 
bailoutComplexity);
 };
 
 //-----------------------------------------------------------------------------
@@ -155,6 +165,41 @@
        seq.clear();
        in_seq.clear();
        done = false;
+}
+
+// for a DiffOp::change, decide whether it should be treated as a successive 
add and delete based on similarity.
+template<typename T>
+inline bool DiffEngine<T>::looksLikeChange(const T& del, const T& add, long 
long bailoutComplexity)
+{
+       TextUtil::WordVector words1, words2;
+       TextUtil::explodeWords(del, words1);
+       TextUtil::explodeWords(add, words2);
+       return calculateSimilarity(words1, words2, bailoutComplexity) > 0.25;
+}
+
+// go through list of changed lines. if they are too dissimilar, convert to 
del+add.
+template<typename T>
+inline void DiffEngine<T>::detectDissimilarChanges(PointerVector& del, 
PointerVector& add, Diff<T>& diff, long long bailoutComplexity)
+{
+       int i;
+       static PointerVector empty;
+       for (i = 0; i<del.size() && i<add.size() && !looksLikeChange(*del[i], 
*add[i], bailoutComplexity); ++i) {
+               PointerVector d, a;
+               d.push_back(del[i]);
+               a.push_back(add[i]);
+               diff.add_edit(DiffOp<T>(DiffOp<T>::del, d, empty));
+               diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, a));
+       }
+       if (i) {
+               add.erase(add.begin(), add.begin()+i);
+               del.erase(del.begin(), del.begin()+i);
+       }
+}
+
+template<>
+inline void DiffEngine<Word>::detectDissimilarChanges(PointerVector& del, 
PointerVector& add, Diff<Word>& diff, long long bailoutComplexity)
+{
+       // compiles to no-op in Word specialization.
 }
 
 template<typename T>
@@ -266,8 +311,18 @@
                while (yi < n_to && ychanged[yi])
                        add.push_back(&to_lines[yi++]);
 
+               detectDissimilarChanges(del, add, diff, bailoutComplexity);
+
                if (del.size() && add.size())
+#ifdef DIFFENGINE__EVERY_CHANGE_IS_AN_ADD_AND_DELETE
+               // for generating a worst-case benchmark of the "show moved 
paragraphs" patch (gerrit change 319866)
+               {
+                       diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
+                       diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
+               }
+#else
                        diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
+#endif
                else if (del.size())
                        diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
                else if (add.size())
@@ -601,4 +656,53 @@
        engine.diff(from_lines, to_lines, *this, bailoutComplexity);
 }
 
+inline double calculateSimilarity(TextUtil::WordVector& words1, 
TextUtil::WordVector& words2, long long bailoutComplexity, int *opCountPtr /* = 
nullptr*/)
+{
+       typedef Diff<Word> WordDiff;
+       WordDiff diff(words1, words2, bailoutComplexity);
+       int charsTotal = 0;
+       int opCharCount[4] = { 0 };
+       double similarity;
+       auto countOpChars = [] (DiffEngine<Word>::PointerVector& p) {
+               return std::accumulate(p.begin(), p.end(), 0, [] (int a, const 
Word *b) {
+                       return a + (b->suffixEnd - b->bodyStart);
+               });
+       };
+       for (int i = 0; i < diff.size(); ++i) {
+               int op = diff[i].op;
+               int charCount;
+               switch (diff[i].op) {
+                       case DiffOp<Word>::del:
+                       case DiffOp<Word>::copy:
+                               charCount = countOpChars(diff[i].from);
+                               break;
+                       case DiffOp<Word>::add:
+                               charCount = countOpChars(diff[i].to);
+                               break;
+                       case DiffOp<Word>::change:
+                               charCount = 
std::max(countOpChars(diff[i].from), countOpChars(diff[i].to));
+                               break;
+               }
+               opCharCount[op] += charCount;
+               charsTotal += charCount;
+       }
+       if (opCharCount[DiffOp<Word>::copy] == 0) {
+               similarity = 0.0;
+       } else {
+               if (charsTotal) {
+                       similarity = double(opCharCount[DiffOp<Word>::copy]) / 
charsTotal;
+               } else {
+                       similarity = 0.0;
+               }
+       }
+
+       if (opCountPtr) {
+               for(int i = 0; i < sizeof(opCharCount)/sizeof(opCharCount[0]); 
++i) {
+                       opCountPtr[i] = opCharCount[i];
+               }
+       }
+
+       return similarity;
+}
+
 #endif
diff --git a/InlineDiff.cpp b/InlineDiff.cpp
index d60215c..bc2ed54 100644
--- a/InlineDiff.cpp
+++ b/InlineDiff.cpp
@@ -14,8 +14,8 @@
 {
        WordVector words1, words2;
 
-       explodeWords(text1, words1);
-       explodeWords(text2, words2);
+       TextUtil::explodeWords(text1, words1);
+       TextUtil::explodeWords(text2, words2);
        WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
        String word;
 
diff --git a/TableDiff.cpp b/TableDiff.cpp
index 5a4969f..9c44cc2 100644
--- a/TableDiff.cpp
+++ b/TableDiff.cpp
@@ -27,8 +27,8 @@
 {
        WordVector words1, words2;
 
-       explodeWords(text1, words1);
-       explodeWords(text2, words2);
+       TextUtil::explodeWords(text1, words1);
+       TextUtil::explodeWords(text2, words2);
        WordDiff worddiff(words1, words2, MAX_WORD_LEVEL_DIFF_COMPLEXITY);
 
        //debugPrintWordDiff(worddiff);
diff --git a/Wikidiff2.cpp b/Wikidiff2.cpp
index d579b61..b5ffc05 100644
--- a/Wikidiff2.cpp
+++ b/Wikidiff2.cpp
@@ -8,9 +8,6 @@
 #include <stdio.h>
 #include <string.h>
 #include "Wikidiff2.h"
-#include <thai/thailib.h>
-#include <thai/thwchar.h>
-#include <thai/thbrk.h>
 
 
 void Wikidiff2::diffLines(const StringVector & lines1, const StringVector & 
lines2,
@@ -162,144 +159,6 @@
        // Append the rest of the string after the last special character
        if (start < input.size()) {
                result.append(input, start, input.size() - start);
-       }
-}
-
-// Weak UTF-8 decoder
-// Will return garbage on invalid input (overshort sequences, overlong 
sequences, etc.)
-int Wikidiff2::nextUtf8Char(String::const_iterator & p, String::const_iterator 
& charStart,
-               String::const_iterator end)
-{
-       int c = 0;
-       unsigned char byte;
-       int seqLength = 0;
-       charStart = p;
-       if (p == end) {
-               return 0;
-       }
-       do {
-               byte = (unsigned char)*p;
-               if (byte < 0x80) {
-                       c = byte;
-                       seqLength = 0;
-               } else if (byte >= 0xc0) {
-                       // Start of UTF-8 character
-                       // If this is unexpected, due to an overshort sequence, 
we ignore the invalid
-                       // sequence and resynchronise here
-                       if (byte < 0xe0) {
-                               seqLength = 1;
-                               c = byte & 0x1f;
-                       } else if (byte < 0xf0) {
-                               seqLength = 2;
-                               c = byte & 0x0f;
-                       } else {
-                               seqLength = 3;
-                               c = byte & 7;
-                       }
-               } else if (seqLength) {
-                       c <<= 6;
-                       c |= byte & 0x3f;
-                       --seqLength;
-               } else {
-                       // Unexpected continuation, ignore
-               }
-               ++p;
-       } while (seqLength && p != end);
-       return c;
-}
-
-// Split a string into words
-//
-// TODO: I think the best way to do this would be to use ICU BreakIterator
-// instead of libthai + DIY. Basically you'd run BreakIterators from several
-// different locales (en, th, ja) and merge the results, i.e. if a break occurs
-// in any locale at a given position, split the string. I don't know if the
-// quality of the Thai dictionary in ICU matches the one in libthai, we would
-// have to check this somehow.
-void Wikidiff2::explodeWords(const String & text, WordVector &words)
-{
-       // Decode the UTF-8 in the string.
-       // * Save the character sizes (in bytes)
-       // * Convert the string to TIS-620, which is the internal character set 
of libthai.
-       // * Save the character offsets of any break positions (same format as 
libthai).
-
-       String tisText, charSizes;
-       String::const_iterator suffixEnd, charStart, p;
-       IntSet breaks;
-
-       tisText.reserve(text.size());
-       charSizes.reserve(text.size());
-       wchar_t ch, lastChar;
-       thchar_t thaiChar;
-       bool hasThaiChars = false;
-
-       p = text.begin();
-       ch = nextUtf8Char(p, charStart, text.end());
-       lastChar = 0;
-       int charIndex = 0;
-       while (ch) {
-               thaiChar = th_uni2tis(ch);
-               if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
-                       hasThaiChars = true;
-               }
-               tisText += (char)thaiChar;
-               charSizes += (char)(p - charStart);
-
-               if (isLetter(ch)) {
-                       if (lastChar && !isLetter(lastChar)) {
-                               breaks.insert(charIndex);
-                       }
-               } else {
-                       breaks.insert(charIndex);
-               }
-               charIndex++;
-               lastChar = ch;
-               ch = nextUtf8Char(p, charStart, text.end());
-       }
-
-       // If there were any Thai characters in the string, run th_brk on it 
and add
-       // the resulting break positions
-       if (hasThaiChars) {
-               IntVector thaiBreakPositions;
-               tisText += '\0';
-               thaiBreakPositions.resize(tisText.size());
-               int numBreaks = th_brk((const thchar_t*)(tisText.data()),
-                               &thaiBreakPositions[0], 
thaiBreakPositions.size());
-               thaiBreakPositions.resize(numBreaks);
-               breaks.insert(thaiBreakPositions.begin(), 
thaiBreakPositions.end());
-       }
-
-       // Add a fake end-of-string character and have a break on it, so that 
the
-       // last word gets added without special handling
-       breaks.insert(charSizes.size());
-       charSizes += (char)0;
-
-       // Now make the word array by traversing the breaks set
-       p = text.begin();
-       IntSet::iterator pBrk = breaks.begin();
-       String::const_iterator wordStart = text.begin();
-       String::const_iterator suffixStart = text.end();
-
-       // If there's a break at the start of the string, skip it
-       if (pBrk != breaks.end() && *pBrk == 0) {
-               pBrk++;
-       }
-
-       for (charIndex = 0; charIndex < charSizes.size(); p += 
charSizes[charIndex++]) {
-               // Assume all spaces are ASCII
-               if (isSpace(*p)) {
-                       suffixStart = p;
-               }
-               if (pBrk != breaks.end() && charIndex == *pBrk) {
-                       if (suffixStart == text.end()) {
-                               words.push_back(Word(wordStart, p, p));
-                       } else {
-                               words.push_back(Word(wordStart, suffixStart, 
p));
-                       }
-                       pBrk++;
-                       suffixStart = text.end();
-                       wordStart = p;
-               }
        }
 }
 
diff --git a/Wikidiff2.h b/Wikidiff2.h
index da75e7b..8111043 100644
--- a/Wikidiff2.h
+++ b/Wikidiff2.h
@@ -43,40 +43,10 @@
                virtual void printContext(const String & input) = 0;
 
                void printText(const String & input);
-               inline bool isLetter(int ch);
-               inline bool isSpace(int ch);
                void debugPrintWordDiff(WordDiff & worddiff);
 
-               int nextUtf8Char(String::const_iterator & p, 
String::const_iterator & charStart,
-                               String::const_iterator end);
-
-               void explodeWords(const String & text, WordVector &tokens);
                void explodeLines(const String & text, StringVector &lines);
 };
-
-inline bool Wikidiff2::isLetter(int ch)
-{
-       // Standard alphanumeric
-       if ((ch >= '0' && ch <= '9') ||
-          (ch == '_') ||
-          (ch >= 'A' && ch <= 'Z') ||
-          (ch >= 'a' && ch <= 'z'))
-       {
-               return true;
-       }
-       // Punctuation and control characters
-       if (ch < 0xc0) return false;
-       // Chinese, Japanese: split up character by character
-       if (ch >= 0x3000 && ch <= 0x9fff) return false;
-       if (ch >= 0x20000 && ch <= 0x2a000) return false;
-       // Otherwise assume it's from a language that uses spaces
-       return true;
-}
-
-inline bool Wikidiff2::isSpace(int ch)
-{
-       return ch == ' ' || ch == '\t';
-}
 
 inline const Wikidiff2::String & Wikidiff2::getResult() const
 {
diff --git a/config.m4 b/config.m4
index b848398..c9e2197 100644
--- a/config.m4
+++ b/config.m4
@@ -36,6 +36,6 @@
 
   PHP_SUBST(WIKIDIFF2_SHARED_LIBADD)
   AC_DEFINE(HAVE_WIKIDIFF2, 1, [ ])
-  export CXXFLAGS="-Wno-write-strings $CXXFLAGS"
+  export CXXFLAGS="-Wno-write-strings -std=c++11 $CXXFLAGS"
   PHP_NEW_EXTENSION(wikidiff2, php_wikidiff2.cpp Wikidiff2.cpp TableDiff.cpp 
InlineDiff.cpp, $ext_shared)
 fi
diff --git a/php_wikidiff2.cpp b/php_wikidiff2.cpp
index e15ad2b..6f77fc1 100644
--- a/php_wikidiff2.cpp
+++ b/php_wikidiff2.cpp
@@ -44,7 +44,6 @@
        STANDARD_MODULE_PROPERTIES
 };
 
-
 #ifdef COMPILE_DL_WIKIDIFF2
 ZEND_GET_MODULE(wikidiff2)
 #endif
diff --git a/tests/004.phpt b/tests/004.phpt
index 32594d4..3dc0fa5 100644
--- a/tests/004.phpt
+++ b/tests/004.phpt
@@ -32,7 +32,8 @@
 <div class="mw-diff-inline-changed">foo <del>bar</del><ins>test</ins></div>
 <div class="mw-diff-inline-deleted"><del>&#160;</del></div>
 <div class="mw-diff-inline-context">baz</div>
-<div class="mw-diff-inline-changed"><del>quux</del><ins>test</ins></div>
+<div class="mw-diff-inline-deleted"><del>quux</del></div>
+<div class="mw-diff-inline-added"><ins>test</ins></div>
 <div class="mw-diff-inline-added"><ins>&#160;</ins></div>
 <div class="mw-diff-inline-context">bang</div>
 
diff --git a/tests/008.phpt b/tests/008.phpt
new file mode 100644
index 0000000..92499c5
--- /dev/null
+++ b/tests/008.phpt
@@ -0,0 +1,54 @@
+--TEST--
+Test detection of dissimilar paragraphs
+--SKIPIF--
+<?php if (!extension_loaded("wikidiff2")) print "skip"; ?>
+--FILE--
+<?php
+$x = <<<EOT
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA
+
+EOT;
+
+#---------------------------------------------------
+
+$y = <<<EOT
+AAAAA AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB
+
+EOT;
+
+#---------------------------------------------------
+
+print wikidiff2_do_diff( $x, $y, 2 );
+
+?>
+--EXPECT--
+<tr>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+  <td colspan="2" class="diff-lineno"><!--LINE 1--></td>
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"><div>AAAAA AAAAA <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del> <del class="diffchange 
diffchange-inline">AAAAA</del></div></td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"><div>AAAAA AAAAA <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins> <ins class="diffchange 
diffchange-inline">BBBBB</ins></div></td>
+</tr>
+<tr>
+  <td class="diff-marker">&#160;</td>
+  <td class="diff-context"></td>
+  <td class="diff-marker">&#160;</td>
+  <td class="diff-context"></td>
+</tr>
+<tr>
+  <td class="diff-marker">−</td>
+  <td class="diff-deletedline"><div>AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA AAAAA 
AAAAA AAAAA AAAAA</div></td>
+  <td colspan="2" class="diff-empty">&#160;</td>
+</tr>
+<tr>
+  <td colspan="2" class="diff-empty">&#160;</td>
+  <td class="diff-marker">+</td>
+  <td class="diff-addedline"><div>AAAAA BBBBB BBBBB BBBBB BBBBB BBBBB BBBBB 
BBBBB BBBBB BBBBB</div></td>
+</tr>
\ No newline at end of file
diff --git a/textutil.h b/textutil.h
new file mode 100644
index 0000000..6031380
--- /dev/null
+++ b/textutil.h
@@ -0,0 +1,180 @@
+#ifndef TEXTUTIL_H
+#define TEXTUTIL_H
+
+#include <thai/thailib.h>
+#include <thai/thwchar.h>
+#include <thai/thbrk.h>
+
+namespace TextUtil
+{
+       typedef std::basic_string<char, std::char_traits<char>, 
WD2_ALLOCATOR<char> > String;
+       typedef std::vector<Word, WD2_ALLOCATOR<Word> > WordVector;
+       typedef std::set<int, std::less<int>, WD2_ALLOCATOR<int> > IntSet;
+       typedef std::vector<int, WD2_ALLOCATOR<int> > IntVector;
+
+       // helper functions used in both DiffEngine and Wikidiff2
+
+       inline bool isLetter(int ch)
+       {
+               // Standard alphanumeric
+               if ((ch >= '0' && ch <= '9') ||
+                  (ch == '_') ||
+                  (ch >= 'A' && ch <= 'Z') ||
+                  (ch >= 'a' && ch <= 'z'))
+               {
+                       return true;
+               }
+               // Punctuation and control characters
+               if (ch < 0xc0) return false;
+               // Chinese, Japanese: split up character by character
+               if (ch >= 0x3000 && ch <= 0x9fff) return false;
+               if (ch >= 0x20000 && ch <= 0x2a000) return false;
+               // Otherwise assume it's from a language that uses spaces
+               return true;
+       }
+
+       inline bool isSpace(int ch)
+       {
+               return ch == ' ' || ch == '\t';
+       }
+
+       // Weak UTF-8 decoder
+       // Will return garbage on invalid input (overshort sequences, overlong 
sequences, etc.)
+       inline int nextUtf8Char(String::const_iterator & p, 
String::const_iterator & charStart,
+                       String::const_iterator end)
+       {
+               int c = 0;
+               unsigned char byte;
+               int seqLength = 0;
+               charStart = p;
+               if (p == end) {
+                       return 0;
+               }
+               do {
+                       byte = (unsigned char)*p;
+                       if (byte < 0x80) {
+                               c = byte;
+                               seqLength = 0;
+                       } else if (byte >= 0xc0) {
+                               // Start of UTF-8 character
+                               // If this is unexpected, due to an overshort 
sequence, we ignore the invalid
+                               // sequence and resynchronise here
+                               if (byte < 0xe0) {
+                                       seqLength = 1;
+                                       c = byte & 0x1f;
+                               } else if (byte < 0xf0) {
+                                       seqLength = 2;
+                                       c = byte & 0x0f;
+                               } else {
+                                       seqLength = 3;
+                                       c = byte & 7;
+                               }
+                       } else if (seqLength) {
+                               c <<= 6;
+                               c |= byte & 0x3f;
+                               --seqLength;
+                       } else {
+                               // Unexpected continuation, ignore
+                       }
+                       ++p;
+               } while (seqLength && p != end);
+               return c;
+       }
+
+       // Split a string into words
+       //
+       // TODO: I think the best way to do this would be to use ICU 
BreakIterator
+       // instead of libthai + DIY. Basically you'd run BreakIterators from 
several
+       // different locales (en, th, ja) and merge the results, i.e. if a 
break occurs
+       // in any locale at a given position, split the string. I don't know if 
the
+       // quality of the Thai dictionary in ICU matches the one in libthai, we 
would
+       // have to check this somehow.
+       inline void explodeWords(const String & text, WordVector &words)
+       {
+               // Decode the UTF-8 in the string.
+               // * Save the character sizes (in bytes)
+               // * Convert the string to TIS-620, which is the internal 
character set of libthai.
+               // * Save the character offsets of any break positions (same 
format as libthai).
+
+               String tisText, charSizes;
+               String::const_iterator suffixEnd, charStart, p;
+               IntSet breaks;
+
+               tisText.reserve(text.size());
+               charSizes.reserve(text.size());
+               wchar_t ch, lastChar;
+               thchar_t thaiChar;
+               bool hasThaiChars = false;
+
+               p = text.begin();
+               ch = nextUtf8Char(p, charStart, text.end());
+               lastChar = 0;
+               int charIndex = 0;
+               while (ch) {
+                       thaiChar = th_uni2tis(ch);
+                       if (thaiChar >= 0x80 && thaiChar != THCHAR_ERR) {
+                               hasThaiChars = true;
+                       }
+                       tisText += (char)thaiChar;
+                       charSizes += (char)(p - charStart);
+
+                       if (isLetter(ch)) {
+                               if (lastChar && !isLetter(lastChar)) {
+                                       breaks.insert(charIndex);
+                               }
+                       } else {
+                               breaks.insert(charIndex);
+                       }
+                       charIndex++;
+                       lastChar = ch;
+                       ch = nextUtf8Char(p, charStart, text.end());
+               }
+
+               // If there were any Thai characters in the string, run th_brk 
on it and add
+               // the resulting break positions
+               if (hasThaiChars) {
+                       IntVector thaiBreakPositions;
+                       tisText += '\0';
+                       thaiBreakPositions.resize(tisText.size());
+                       int numBreaks = th_brk((const 
thchar_t*)(tisText.data()),
+                                       &thaiBreakPositions[0], 
thaiBreakPositions.size());
+                       thaiBreakPositions.resize(numBreaks);
+                       breaks.insert(thaiBreakPositions.begin(), 
thaiBreakPositions.end());
+               }
+
+               // Add a fake end-of-string character and have a break on it, 
so that the
+               // last word gets added without special handling
+               breaks.insert(charSizes.size());
+               charSizes += (char)0;
+
+               // Now make the word array by traversing the breaks set
+               p = text.begin();
+               IntSet::iterator pBrk = breaks.begin();
+               String::const_iterator wordStart = text.begin();
+               String::const_iterator suffixStart = text.end();
+
+               // If there's a break at the start of the string, skip it
+               if (pBrk != breaks.end() && *pBrk == 0) {
+                       pBrk++;
+               }
+
+               for (charIndex = 0; charIndex < charSizes.size(); p += 
charSizes[charIndex++]) {
+                       // Assume all spaces are ASCII
+                       if (isSpace(*p)) {
+                               suffixStart = p;
+                       }
+                       if (pBrk != breaks.end() && charIndex == *pBrk) {
+                               if (suffixStart == text.end()) {
+                                       words.push_back(Word(wordStart, p, p));
+                               } else {
+                                       words.push_back(Word(wordStart, 
suffixStart, p));
+                               }
+                               pBrk++;
+                               suffixStart = text.end();
+                               wordStart = p;
+                       }
+               }
+       }
+}
+
+#endif // TEXTUTIL_H

-- 
To view, visit https://gerrit.wikimedia.org/r/356582
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: If0952aa53e472c23237890ee6d6997ac9d6d7c0e
Gerrit-PatchSet: 24
Gerrit-Project: mediawiki/php/wikidiff2
Gerrit-Branch: master
Gerrit-Owner: Jkroll <johannes.kr...@wikimedia.de>
Gerrit-Reviewer: Awight <awi...@wikimedia.org>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Jkroll <johannes.kr...@wikimedia.de>
Gerrit-Reviewer: MaxSem <maxsem.w...@gmail.com>
Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org>
Gerrit-Reviewer: Tobias Gritschacher <tobias.gritschac...@wikimedia.de>
Gerrit-Reviewer: WMDE-Fisch <christoph.jau...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...wikidiff2[master]: Better change/add/delete distinction for DiffOps

Reply via email to