[PATCH] UTF-8 support for clang-format.

Alexander Kornienko Tue, 04 Jun 2013 16:59:22 -0700

Hi klimek, djasper,

Detect whether the file is valid UTF-8 and, if so, count code points instead
of bytes in (hopefully) all places where a column count is needed. In
particular, use the new FormatToken.CodePointCount instead of TokenLength
where appropriate.
Changed the BreakableToken implementations to respect UTF-8 character
boundaries when in UTF-8 mode.


http://llvm-reviews.chandlerc.com/D918

Files:
  lib/Format/BreakableToken.cpp
  lib/Format/BreakableToken.h
  lib/Format/Format.cpp
  lib/Format/FormatToken.h
  lib/Format/Utils.h
  unittests/Format/FormatTest.cpp

Index: lib/Format/BreakableToken.cpp
===================================================================
--- lib/Format/BreakableToken.cpp
+++ lib/Format/BreakableToken.cpp
@@ -15,6 +15,7 @@
 
 #define DEBUG_TYPE "format-token-breaker"
 
+#include "Utils.h"
 #include "BreakableToken.h"
 #include "clang/Format/Format.h"
 #include "llvm/ADT/STLExtras.h"
@@ -25,66 +26,21 @@
 namespace format {
 namespace {
 
-// FIXME: Move helper string functions to where it makes sense.
-
-unsigned getOctalLength(StringRef Text) {
-  unsigned I = 1;
-  while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) {
-    ++I;
-  }
-  return I;
-}
-
-unsigned getHexLength(StringRef Text) {
-  unsigned I = 2; // Point after '\x'.
-  while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') ||
-                             (Text[I] >= 'a' && Text[I] <= 'f') ||
-                             (Text[I] >= 'A' && Text[I] <= 'F'))) {
-    ++I;
-  }
-  return I;
-}
-
-unsigned getEscapeSequenceLength(StringRef Text) {
-  assert(Text[0] == '\\');
-  if (Text.size() < 2)
-    return 1;
-
-  switch (Text[1]) {
-  case 'u':
-    return 6;
-  case 'U':
-    return 10;
-  case 'x':
-    return getHexLength(Text);
-  default:
-    if (Text[1] >= '0' && Text[1] <= '7')
-      return getOctalLength(Text);
-    return 2;
-  }
-}
-
-StringRef::size_type getStartOfCharacter(StringRef Text,
-                                         StringRef::size_type Offset) {
-  StringRef::size_type NextEscape = Text.find('\\');
-  while (NextEscape != StringRef::npos && NextEscape < Offset) {
-    StringRef::size_type SequenceLength =
-        getEscapeSequenceLength(Text.substr(NextEscape));
-    if (Offset < NextEscape + SequenceLength)
-      return NextEscape;
-    NextEscape = Text.find('\\', NextEscape + SequenceLength);
-  }
-  return Offset;
-}
-
 BreakableToken::Split getCommentSplit(StringRef Text,
                                       unsigned ContentStartColumn,
-                                      unsigned ColumnLimit) {
+                                      unsigned ColumnLimit,
+                                      utils::Encoding Encoding) {
   if (ColumnLimit <= ContentStartColumn + 1)
     return BreakableToken::Split(StringRef::npos, 0);
 
   unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
-  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
+  unsigned MaxSplitBytes = 0;
+
+  for (unsigned NumChars = 0;
+       NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
+    MaxSplitBytes += utils::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
+
+  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplitBytes);
   if (SpaceOffset == StringRef::npos ||
       // Don't break at leading whitespace.
       Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) {
@@ -95,7 +51,7 @@
       // If the comment is only whitespace, we cannot split.
       return BreakableToken::Split(StringRef::npos, 0);
     SpaceOffset =
-        Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace));
+        Text.find(' ', std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
   }
   if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
     StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim();
@@ -108,25 +64,48 @@
 
 BreakableToken::Split getStringSplit(StringRef Text,
                                      unsigned ContentStartColumn,
-                                     unsigned ColumnLimit) {
-
-  if (ColumnLimit <= ContentStartColumn)
-    return BreakableToken::Split(StringRef::npos, 0);
-  unsigned MaxSplit = ColumnLimit - ContentStartColumn;
+                                     unsigned ColumnLimit,
+                                     utils::Encoding Encoding) {
   // FIXME: Reduce unit test case.
   if (Text.empty())
     return BreakableToken::Split(StringRef::npos, 0);
-  MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1);
-  StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
-  if (SpaceOffset != StringRef::npos && SpaceOffset != 0)
+  if (ColumnLimit <= ContentStartColumn)
+    return BreakableToken::Split(StringRef::npos, 0);
+  unsigned MaxSplit =
+      std::min<unsigned>(ColumnLimit - ContentStartColumn,
+                         utils::getCodePointCount(Text, Encoding) - 1);
+  StringRef::size_type SpaceOffset = 0;
+  StringRef::size_type SlashOffset = 0;
+  StringRef::size_type SplitPoint = 0;
+  for (unsigned Chars = 0;;) {
+    unsigned Advance;
+    if (Text[0] == '\\') {
+      Advance = utils::getEscapeSequenceLength(Text);
+      Chars += Advance;
+    } else {
+      Advance = utils::getCodePointNumBytes(Text[0], Encoding);
+      Chars += 1;
+    }
+
+    if (Chars > MaxSplit)
+      break;
+
+    if (Text[0] == ' ')
+      SpaceOffset = SplitPoint;
+    if (Text[0] == '/')
+      SlashOffset = SplitPoint;
+
+    SplitPoint += Advance;
+    Text = Text.substr(Advance);
+  }
+
+  if (SpaceOffset != 0)
     return BreakableToken::Split(SpaceOffset + 1, 0);
-  StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit);
-  if (SlashOffset != StringRef::npos && SlashOffset != 0)
+  if (SlashOffset != 0)
     return BreakableToken::Split(SlashOffset + 1, 0);
-  StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit);
-  if (SplitPoint == StringRef::npos || SplitPoint == 0)
-    return BreakableToken::Split(StringRef::npos, 0);
-  return BreakableToken::Split(SplitPoint, 0);
+  if (SplitPoint != 0)
+    return BreakableToken::Split(SplitPoint, 0);
+  return BreakableToken::Split(StringRef::npos, 0);
 }
 
 } // namespace
@@ -136,8 +115,8 @@
 unsigned
 BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
                                                   unsigned TailOffset) const {
-  return StartColumn + Prefix.size() + Postfix.size() + Line.size() -
-         TailOffset;
+  return StartColumn + Prefix.size() + Postfix.size() +
+         utils::getCodePointCount(Line.substr(TailOffset), Encoding);
 }
 
 void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
@@ -152,22 +131,25 @@
 BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
                                                    unsigned StartColumn,
                                                    StringRef Prefix,
-                                                   StringRef Postfix)
-    : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix),
+                                                   StringRef Postfix,
+                                                   utils::Encoding Encoding)
+    : BreakableToken(Tok, Encoding), StartColumn(StartColumn), Prefix(Prefix),
       Postfix(Postfix) {
   assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
   Line = Tok.TokenText.substr(
       Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
 }
 
 BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
-                                               unsigned StartColumn)
-    : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {}
+                                               unsigned StartColumn,
+                                               utils::Encoding Encoding)
+    : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", Encoding) {}
 
 BreakableToken::Split
 BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
                                  unsigned ColumnLimit) const {
-  return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit);
+  return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
+                        Encoding);
 }
 
 static StringRef getLineCommentPrefix(StringRef Comment) {
@@ -179,23 +161,23 @@
 }
 
 BreakableLineComment::BreakableLineComment(const FormatToken &Token,
-                                           unsigned StartColumn)
+                                           unsigned StartColumn,
+                                           utils::Encoding Encoding)
     : BreakableSingleLineToken(Token, StartColumn,
-                               getLineCommentPrefix(Token.TokenText), "") {}
+                               getLineCommentPrefix(Token.TokenText), "",
+                               Encoding) {}
 
 BreakableToken::Split
 BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                                unsigned ColumnLimit) const {
   return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
-                         ColumnLimit);
+                         ColumnLimit, Encoding);
 }
 
-BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style,
-                                             const FormatToken &Token,
-                                             unsigned StartColumn,
-                                             unsigned OriginalStartColumn,
-                                             bool FirstInLine)
-    : BreakableToken(Token) {
+BreakableBlockComment::BreakableBlockComment(
+    const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
+    unsigned OriginalStartColumn, bool FirstInLine, utils::Encoding Encoding)
+    : BreakableToken(Token, Encoding) {
   StringRef TokenText(Token.TokenText);
   assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
   TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
@@ -290,7 +272,8 @@
 BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
                                                unsigned TailOffset) const {
   return getContentStartColumn(LineIndex, TailOffset) +
-         (Lines[LineIndex].size() - TailOffset) +
+         utils::getCodePointCount(Lines[LineIndex].substr(TailOffset),
+                                  Encoding) +
          // The last line gets a "*/" postfix.
          (LineIndex + 1 == Lines.size() ? 2 : 0);
 }
@@ -300,7 +283,7 @@
                                 unsigned ColumnLimit) const {
   return getCommentSplit(Lines[LineIndex].substr(TailOffset),
                          getContentStartColumn(LineIndex, TailOffset),
-                         ColumnLimit);
+                         ColumnLimit, Encoding);
 }
 
 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
Index: lib/Format/BreakableToken.h
===================================================================
--- lib/Format/BreakableToken.h
+++ lib/Format/BreakableToken.h
@@ -18,6 +18,7 @@
 #define LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
 
 #include "TokenAnnotator.h"
+#include "Utils.h"
 #include "WhitespaceManager.h"
 #include <utility>
 
@@ -65,9 +66,11 @@
                                        WhitespaceManager &Whitespaces) {}
 
 protected:
-  BreakableToken(const FormatToken &Tok) : Tok(Tok) {}
+  BreakableToken(const FormatToken &Tok, utils::Encoding Encoding)
+      : Tok(Tok), Encoding(Encoding) {}
 
   const FormatToken &Tok;
+  utils::Encoding Encoding;
 };
 
 /// \brief Base class for single line tokens that can be broken.
@@ -83,7 +86,8 @@
 
 protected:
   BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
-                           StringRef Prefix, StringRef Postfix);
+                           StringRef Prefix, StringRef Postfix,
+                           utils::Encoding Encoding);
 
   // The column in which the token starts.
   unsigned StartColumn;
@@ -101,7 +105,8 @@
   ///
   /// \p StartColumn specifies the column in which the token will start
   /// after formatting.
-  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn);
+  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
+                         utils::Encoding Encoding);
 
   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                          unsigned ColumnLimit) const;
@@ -113,7 +118,8 @@
   ///
   /// \p StartColumn specifies the column in which the comment will start
   /// after formatting.
-  BreakableLineComment(const FormatToken &Token, unsigned StartColumn);
+  BreakableLineComment(const FormatToken &Token, unsigned StartColumn,
+                       utils::Encoding Encoding);
 
   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                          unsigned ColumnLimit) const;
@@ -129,7 +135,7 @@
   /// If the comment starts a line after formatting, set \p FirstInLine to true.
   BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token,
                         unsigned StartColumn, unsigned OriginaStartColumn,
-                        bool FirstInLine);
+                        bool FirstInLine, utils::Encoding Encoding);
 
   virtual unsigned getLineCount() const;
   virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
Index: lib/Format/Format.cpp
===================================================================
--- lib/Format/Format.cpp
+++ lib/Format/Format.cpp
@@ -18,6 +18,7 @@
 #include "BreakableToken.h"
 #include "TokenAnnotator.h"
 #include "UnwrappedLineParser.h"
+#include "Utils.h"
 #include "WhitespaceManager.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/OperatorPrecedence.h"
@@ -243,10 +244,11 @@
   UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
                          const AnnotatedLine &Line, unsigned FirstIndent,
                          const FormatToken *RootToken,
-                         WhitespaceManager &Whitespaces)
+                         WhitespaceManager &Whitespaces,
+                         utils::Encoding Encoding)
       : Style(Style), SourceMgr(SourceMgr), Line(Line),
         FirstIndent(FirstIndent), RootToken(RootToken),
-        Whitespaces(Whitespaces), Count(0) {}
+        Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
 
   /// \brief Formats an \c UnwrappedLine.
   void format(const AnnotatedLine *NextLine) {
@@ -484,7 +486,7 @@
                                  State.NextToken->WhitespaceRange.getEnd()) -
                              SourceMgr.getSpellingColumnNumber(
                                  State.NextToken->WhitespaceRange.getBegin());
-      State.Column += WhitespaceLength + State.NextToken->TokenLength;
+      State.Column += WhitespaceLength + State.NextToken->CodePointCount;
       State.NextToken = State.NextToken->Next;
       return 0;
     }
@@ -520,11 +522,11 @@
                   Line.StartsDefinition)) {
         State.Column = State.Stack.back().Indent;
       } else if (Current.Type == TT_ObjCSelectorName) {
-        if (State.Stack.back().ColonPos > Current.TokenLength) {
-          State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+        if (State.Stack.back().ColonPos > Current.CodePointCount) {
+          State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
         } else {
           State.Column = State.Stack.back().Indent;
-          State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+          State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
         }
       } else if (Current.Type == TT_StartOfName ||
                  Previous.isOneOf(tok::coloncolon, tok::equal) ||
@@ -560,7 +562,7 @@
       State.Stack.back().LastSpace = State.Column;
       if (Current.isOneOf(tok::arrow, tok::period) &&
           Current.Type != TT_DesignatedInitializerPeriod)
-        State.Stack.back().LastSpace += Current.TokenLength;
+        State.Stack.back().LastSpace += Current.CodePointCount;
       State.StartOfLineLevel = State.ParenLevel;
       State.LowestCallLevel = State.ParenLevel;
 
@@ -595,8 +597,8 @@
         State.Stack.back().VariablePos = State.Column;
         // Move over * and & if they are bound to the variable name.
         const FormatToken *Tok = &Previous;
-        while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
-          State.Stack.back().VariablePos -= Tok->TokenLength;
+        while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+          State.Stack.back().VariablePos -= Tok->CodePointCount;
           if (Tok->SpacesRequiredBefore != 0)
             break;
           Tok = Tok->Previous;
@@ -614,12 +616,12 @@
       if (Current.Type == TT_ObjCSelectorName &&
           State.Stack.back().ColonPos == 0) {
         if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
-            State.Column + Spaces + Current.TokenLength)
+            State.Column + Spaces + Current.CodePointCount)
           State.Stack.back().ColonPos =
               State.Stack.back().Indent + Current.LongestObjCSelectorName;
         else
           State.Stack.back().ColonPos =
-              State.Column + Spaces + Current.TokenLength;
+              State.Column + Spaces + Current.CodePointCount;
       }
 
       if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -671,7 +673,7 @@
       State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
       if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
         State.Stack.back().StartOfFunctionCall =
-            Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+            Current.LastInChainOfCalls ? 0 : State.Column + Current.CodePointCount;
     }
     if (Current.Type == TT_CtorInitializerColon) {
       // Indent 2 from the column, so:
@@ -779,7 +781,7 @@
       State.StartOfStringLiteral = 0;
     }
 
-    State.Column += Current.TokenLength;
+    State.Column += Current.CodePointCount;
 
     State.NextToken = State.NextToken->Next;
 
@@ -798,7 +800,7 @@
                                 bool DryRun) {
     unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
     llvm::OwningPtr<BreakableToken> Token;
-    unsigned StartColumn = State.Column - Current.TokenLength;
+    unsigned StartColumn = State.Column - Current.CodePointCount;
     unsigned OriginalStartColumn =
         SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
         1;
@@ -811,15 +813,16 @@
       if (!LiteralData || *LiteralData != '"')
         return 0;
 
-      Token.reset(new BreakableStringLiteral(Current, StartColumn));
+      Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
     } else if (Current.Type == TT_BlockComment) {
       BreakableBlockComment *BBC = new BreakableBlockComment(
-          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+          Encoding);
       Token.reset(BBC);
     } else if (Current.Type == TT_LineComment &&
                (Current.Previous == NULL ||
                 Current.Previous->Type != TT_ImplicitStringLiteral)) {
-      Token.reset(new BreakableLineComment(Current, StartColumn));
+      Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
     } else {
       return 0;
     }
@@ -1080,13 +1083,16 @@
   // Increasing count of \c StateNode items we have created. This is used
   // to create a deterministic order independent of the container.
   unsigned Count;
+  utils::Encoding Encoding;
 };
 
 class FormatTokenLexer {
 public:
-  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+                   utils::Encoding Encoding)
       : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
-        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+        Encoding(Encoding) {
     Lex.SetKeepWhitespaceMode(true);
   }
 
@@ -1112,6 +1118,7 @@
       FormatTok->WhitespaceRange =
           SourceRange(GreaterLocation, GreaterLocation);
       FormatTok->TokenLength = 1;
+      FormatTok->CodePointCount = 1;
       GreaterStashed = false;
       return FormatTok;
     }
@@ -1180,6 +1187,10 @@
       GreaterStashed = true;
     }
 
+    unsigned EncodingExtraBytes =
+        Text.size() - utils::getCodePointCount(Text, Encoding);
+    FormatTok->CodePointCount = FormatTok->TokenLength - EncodingExtraBytes;
+
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
     FormatTok->TokenText = StringRef(
@@ -1194,6 +1205,7 @@
   Lexer &Lex;
   SourceManager &SourceMgr;
   IdentifierTable IdentTable;
+  utils::Encoding Encoding;
   llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
   SmallVector<FormatToken *, 16> Tokens;
 
@@ -1209,12 +1221,17 @@
   Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
             const std::vector<CharSourceRange> &Ranges)
       : Style(Style), Lex(Lex), SourceMgr(SourceMgr),
-        Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+        Whitespaces(SourceMgr, Style), Ranges(Ranges),
+        Encoding(utils::detectEncoding(Lex.getBuffer())) {
+    DEBUG(llvm::dbgs()
+          << "File encoding: "
+          << (Encoding == utils::Encoding_UTF8 ? "UTF8" : "unknown") << "\n");
+  }
 
   virtual ~Formatter() {}
 
   tooling::Replacements format() {
-    FormatTokenLexer Tokens(Lex, SourceMgr);
+    FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
 
     UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
     bool StructuralError = Parser.parse();
@@ -1290,7 +1307,7 @@
               1;
         }
         UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
-                                         TheLine.First, Whitespaces);
+                                         TheLine.First, Whitespaces, Encoding);
         Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
         IndentForLevel[TheLine.Level] = LevelIndent;
         PreviousLineWasTouched = true;
@@ -1616,6 +1633,8 @@
   WhitespaceManager Whitespaces;
   std::vector<CharSourceRange> Ranges;
   std::vector<AnnotatedLine> AnnotatedLines;
+
+  utils::Encoding Encoding;
 };
 
 tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
Index: lib/Format/FormatToken.h
===================================================================
--- lib/Format/FormatToken.h
+++ lib/Format/FormatToken.h
@@ -61,11 +61,12 @@
 struct FormatToken {
   FormatToken()
       : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        TokenLength(0), IsFirst(false), MustBreakBefore(false),
-        Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false),
-        ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0),
-        UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
-        LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
+        TokenLength(0), CodePointCount(0), IsFirst(false),
+        MustBreakBefore(false), Type(TT_Unknown), SpacesRequiredBefore(0),
+        CanBreakBefore(false), ClosesTemplateDeclaration(false),
+        ParameterCount(0), TotalLength(0), UnbreakableTailLength(0),
+        BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
+        FakeRParens(0), LastInChainOfCalls(false),
         PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
         Next(NULL) {}
 
@@ -94,6 +95,10 @@
   /// with the token.
   unsigned TokenLength;
 
+  /// \brief The length of the non-whitespace parts of the token in code points.
+  /// We need this to correctly measure the number of columns a token spans.
+  unsigned CodePointCount;
+
   /// \brief Indicates that this is the first token.
   bool IsFirst;
 
Index: lib/Format/Utils.h
===================================================================
--- /dev/null
+++ lib/Format/Utils.h
@@ -0,0 +1,104 @@
+//===--- Utils.h - Format C++ code ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Contains various utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FORMAT_UTILS_H
+#define LLVM_CLANG_FORMAT_UTILS_H
+
+#include "clang/Basic/LLVM.h"
+#include "llvm/Support/ConvertUTF.h"
+
+namespace clang {
+namespace format {
+namespace utils {
+
+enum Encoding {
+  Encoding_UTF8,
+  Encoding_Unknown
+};
+
+inline Encoding detectEncoding(StringRef Text) {
+  const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
+  const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
+  if (::isLegalUTF8String(&Ptr, BufEnd))
+    return Encoding_UTF8;
+  return Encoding_Unknown;
+}
+
+inline unsigned getCodePointCountUTF8(StringRef Text) {
+  unsigned CodePoints = 0;
+  for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
+    ++CodePoints;
+  }
+  return CodePoints;
+}
+
+inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
+  switch (Encoding) {
+    case Encoding_UTF8:
+      return getCodePointCountUTF8(Text);
+    default:
+      return Text.size();
+  }
+}
+
+inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
+  switch (Encoding) {
+    case Encoding_UTF8:
+      return getNumBytesForUTF8(FirstChar);
+    default:
+      return 1;
+  }
+}
+
+inline bool isOctDigit(char c) {
+  return '0' <= c && c <= '7';
+}
+
+inline bool isHexDigit(char c) {
+  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
+         ('A' <= c && c <= 'F');
+}
+
+inline unsigned getEscapeSequenceLength(StringRef Text) {
+  assert(Text[0] == '\\');
+  if (Text.size() < 2)
+    return 1;
+
+  switch (Text[1]) {
+  case 'u':
+    return 6;
+  case 'U':
+    return 10;
+  case 'x': {
+    unsigned I = 2; // Point after '\x'.
+    while (I < Text.size() && isHexDigit(Text[I]))
+      ++I;
+    return I;
+  }
+  default:
+    if (isOctDigit(Text[1])) {
+      unsigned I = 1;
+      while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
+        ++I;
+      return I;
+    }
+    return 2;
+  }
+}
+
+} // namespace utils
+} // namespace format
+} // namespace clang
+
+#endif // LLVM_CLANG_FORMAT_UTILS_H
Index: unittests/Format/FormatTest.cpp
===================================================================
--- unittests/Format/FormatTest.cpp
+++ unittests/Format/FormatTest.cpp
@@ -4873,5 +4873,76 @@
   EXPECT_EQ(Style, ParsedStyle);
 }
 
+TEST_F(FormatTest, WorksFor8bitEncodings) {
+  EXPECT_EQ("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 \"\n"
+            "\"\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \"\n"
+            "\"\xe7\xe8\xec\xed\xfe\xfe \"\n"
+            "\"\xef\xee\xf0\xf3...\"",
+            format("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 "
+                   "\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \xe7\xe8\xec\xed\xfe\xfe "
+                   "\xef\xee\xf0\xf3...\"",
+                   getLLVMStyleWithColumns(12)));
+}
+
+TEST_F(FormatTest, CountsUTF8CharactersProperly) {
+  verifyFormat("\"Однажды в студёную зимнюю пору...\"",
+               getLLVMStyleWithColumns(35));
+  verifyFormat("\"一 二 三 四 五 六 七 八 九 十\"",
+               getLLVMStyleWithColumns(21));
+  verifyFormat("// Однажды в студёную зимнюю пору...",
+               getLLVMStyleWithColumns(36));
+  verifyFormat("// 一 二 三 四 五 六 七 八 九 十",
+               getLLVMStyleWithColumns(22));
+  verifyFormat("/* Однажды в студёную зимнюю пору... */",
+               getLLVMStyleWithColumns(39));
+  verifyFormat("/* 一 二 三 四 五 六 七 八 九 十 */",
+               getLLVMStyleWithColumns(25));
+}
+
+TEST_F(FormatTest, SplitsUTF8Strings) {
+  EXPECT_EQ(
+      "\"Однажды в \"\n"
+      "\"студёную \"\n"
+      "\"зимнюю \"\n"
+      "\"пору...\"",
+      format("\"Однажды в студёную зимнюю пору...\"",
+             getLLVMStyleWithColumns(12)));
+  EXPECT_EQ("\"一 二 三 四 \"\n"
+            "\"五 六 七 八 \"\n"
+            "\"九 十\"",
+            format("\"一 二 三 四 五 六 七 八 九 十\"",
+                   getLLVMStyleWithColumns(10)));
+}
+
+TEST_F(FormatTest, SplitsUTF8LineComments) {
+  EXPECT_EQ("// Однажды в\n"
+            "// студёную\n"
+            "// зимнюю\n"
+            "// пору...",
+            format("// Однажды в студёную   зимнюю   пору...",
+                   getLLVMStyleWithColumns(12)));
+  EXPECT_EQ("// 一二三\n"
+            "// 四五六七\n"
+            "// 八\n"
+            "// 九 十",
+            format("// 一二三 四五六七 八  九 十", getLLVMStyleWithColumns(6)));
+}
+
+TEST_F(FormatTest, SplitsUTF8BlockComments) {
+  EXPECT_EQ("/* Однажды в\n"
+            " * студёную\n"
+            " * зимнюю\n"
+            " * пору...\n"
+            " */",
+            format("/* Однажды в студёную   зимнюю   пору... */",
+                   getLLVMStyleWithColumns(12)));
+  EXPECT_EQ("/* 一二三\n"
+            " * 四五六七\n"
+            " * 八\n"
+            " * 九 十\n"
+            " */",
+            format("/* 一二三 四五六七 八  九 十 */", getLLVMStyleWithColumns(6)));
+}
+
 } // end namespace tooling
 } // end namespace clang

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

[PATCH] UTF-8 support for clang-format.

Reply via email to