Better tests.
Hi klimek, djasper,
http://llvm-reviews.chandlerc.com/D918
CHANGE SINCE LAST DIFF
http://llvm-reviews.chandlerc.com/D918?vs=2267&id=2268#toc
Files:
lib/Format/BreakableToken.cpp
lib/Format/BreakableToken.h
lib/Format/Format.cpp
lib/Format/FormatToken.h
lib/Format/Utils.h
unittests/Format/FormatTest.cpp
Index: lib/Format/BreakableToken.cpp
===================================================================
--- lib/Format/BreakableToken.cpp
+++ lib/Format/BreakableToken.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "format-token-breaker"
+#include "Utils.h"
#include "BreakableToken.h"
#include "clang/Format/Format.h"
#include "llvm/ADT/STLExtras.h"
@@ -25,66 +26,21 @@
namespace format {
namespace {
-// FIXME: Move helper string functions to where it makes sense.
-
-unsigned getOctalLength(StringRef Text) {
- unsigned I = 1;
- while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) {
- ++I;
- }
- return I;
-}
-
-unsigned getHexLength(StringRef Text) {
- unsigned I = 2; // Point after '\x'.
- while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') ||
- (Text[I] >= 'a' && Text[I] <= 'f') ||
- (Text[I] >= 'A' && Text[I] <= 'F'))) {
- ++I;
- }
- return I;
-}
-
-unsigned getEscapeSequenceLength(StringRef Text) {
- assert(Text[0] == '\\');
- if (Text.size() < 2)
- return 1;
-
- switch (Text[1]) {
- case 'u':
- return 6;
- case 'U':
- return 10;
- case 'x':
- return getHexLength(Text);
- default:
- if (Text[1] >= '0' && Text[1] <= '7')
- return getOctalLength(Text);
- return 2;
- }
-}
-
-StringRef::size_type getStartOfCharacter(StringRef Text,
- StringRef::size_type Offset) {
- StringRef::size_type NextEscape = Text.find('\\');
- while (NextEscape != StringRef::npos && NextEscape < Offset) {
- StringRef::size_type SequenceLength =
- getEscapeSequenceLength(Text.substr(NextEscape));
- if (Offset < NextEscape + SequenceLength)
- return NextEscape;
- NextEscape = Text.find('\\', NextEscape + SequenceLength);
- }
- return Offset;
-}
-
BreakableToken::Split getCommentSplit(StringRef Text,
unsigned ContentStartColumn,
- unsigned ColumnLimit) {
+ unsigned ColumnLimit,
+ utils::Encoding Encoding) {
if (ColumnLimit <= ContentStartColumn + 1)
return BreakableToken::Split(StringRef::npos, 0);
unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
- StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
+ unsigned MaxSplitBytes = 0;
+
+ for (unsigned NumChars = 0;
+ NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
+ MaxSplitBytes += utils::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
+
+ StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplitBytes);
if (SpaceOffset == StringRef::npos ||
// Don't break at leading whitespace.
Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) {
@@ -95,7 +51,7 @@
// If the comment is only whitespace, we cannot split.
return BreakableToken::Split(StringRef::npos, 0);
SpaceOffset =
- Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace));
+ Text.find(' ', std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
}
if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim();
@@ -108,25 +64,48 @@
BreakableToken::Split getStringSplit(StringRef Text,
unsigned ContentStartColumn,
- unsigned ColumnLimit) {
-
- if (ColumnLimit <= ContentStartColumn)
- return BreakableToken::Split(StringRef::npos, 0);
- unsigned MaxSplit = ColumnLimit - ContentStartColumn;
+ unsigned ColumnLimit,
+ utils::Encoding Encoding) {
// FIXME: Reduce unit test case.
if (Text.empty())
return BreakableToken::Split(StringRef::npos, 0);
- MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1);
- StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit);
- if (SpaceOffset != StringRef::npos && SpaceOffset != 0)
+ if (ColumnLimit <= ContentStartColumn)
+ return BreakableToken::Split(StringRef::npos, 0);
+ unsigned MaxSplit =
+ std::min<unsigned>(ColumnLimit - ContentStartColumn,
+ utils::getCodePointCount(Text, Encoding) - 1);
+ StringRef::size_type SpaceOffset = 0;
+ StringRef::size_type SlashOffset = 0;
+ StringRef::size_type SplitPoint = 0;
+ for (unsigned Chars = 0;;) {
+ unsigned Advance;
+ if (Text[0] == '\\') {
+ Advance = utils::getEscapeSequenceLength(Text);
+ Chars += Advance;
+ } else {
+ Advance = utils::getCodePointNumBytes(Text[0], Encoding);
+ Chars += 1;
+ }
+
+ if (Chars > MaxSplit)
+ break;
+
+ if (Text[0] == ' ')
+ SpaceOffset = SplitPoint;
+ if (Text[0] == '/')
+ SlashOffset = SplitPoint;
+
+ SplitPoint += Advance;
+ Text = Text.substr(Advance);
+ }
+
+ if (SpaceOffset != 0)
return BreakableToken::Split(SpaceOffset + 1, 0);
- StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit);
- if (SlashOffset != StringRef::npos && SlashOffset != 0)
+ if (SlashOffset != 0)
return BreakableToken::Split(SlashOffset + 1, 0);
- StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit);
- if (SplitPoint == StringRef::npos || SplitPoint == 0)
- return BreakableToken::Split(StringRef::npos, 0);
- return BreakableToken::Split(SplitPoint, 0);
+ if (SplitPoint != 0)
+ return BreakableToken::Split(SplitPoint, 0);
+ return BreakableToken::Split(StringRef::npos, 0);
}
} // namespace
@@ -136,8 +115,8 @@
unsigned
BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
unsigned TailOffset) const {
- return StartColumn + Prefix.size() + Postfix.size() + Line.size() -
- TailOffset;
+ return StartColumn + Prefix.size() + Postfix.size() +
+ utils::getCodePointCount(Line.substr(TailOffset), Encoding);
}
void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
@@ -152,22 +131,25 @@
BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok,
unsigned StartColumn,
StringRef Prefix,
- StringRef Postfix)
- : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix),
+ StringRef Postfix,
+ utils::Encoding Encoding)
+ : BreakableToken(Tok, Encoding), StartColumn(StartColumn), Prefix(Prefix),
Postfix(Postfix) {
assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
Line = Tok.TokenText.substr(
Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
}
BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
- unsigned StartColumn)
- : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {}
+ unsigned StartColumn,
+ utils::Encoding Encoding)
+ : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", Encoding) {}
BreakableToken::Split
BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const {
- return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit);
+ return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
+ Encoding);
}
static StringRef getLineCommentPrefix(StringRef Comment) {
@@ -179,23 +161,23 @@
}
BreakableLineComment::BreakableLineComment(const FormatToken &Token,
- unsigned StartColumn)
+ unsigned StartColumn,
+ utils::Encoding Encoding)
: BreakableSingleLineToken(Token, StartColumn,
- getLineCommentPrefix(Token.TokenText), "") {}
+ getLineCommentPrefix(Token.TokenText), "",
+ Encoding) {}
BreakableToken::Split
BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const {
return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
- ColumnLimit);
+ ColumnLimit, Encoding);
}
-BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style,
- const FormatToken &Token,
- unsigned StartColumn,
- unsigned OriginalStartColumn,
- bool FirstInLine)
- : BreakableToken(Token) {
+BreakableBlockComment::BreakableBlockComment(
+ const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
+ unsigned OriginalStartColumn, bool FirstInLine, utils::Encoding Encoding)
+ : BreakableToken(Token, Encoding) {
StringRef TokenText(Token.TokenText);
assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
@@ -290,7 +272,8 @@
BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
unsigned TailOffset) const {
return getContentStartColumn(LineIndex, TailOffset) +
- (Lines[LineIndex].size() - TailOffset) +
+ utils::getCodePointCount(Lines[LineIndex].substr(TailOffset),
+ Encoding) +
// The last line gets a "*/" postfix.
(LineIndex + 1 == Lines.size() ? 2 : 0);
}
@@ -300,7 +283,7 @@
unsigned ColumnLimit) const {
return getCommentSplit(Lines[LineIndex].substr(TailOffset),
getContentStartColumn(LineIndex, TailOffset),
- ColumnLimit);
+ ColumnLimit, Encoding);
}
void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
Index: lib/Format/BreakableToken.h
===================================================================
--- lib/Format/BreakableToken.h
+++ lib/Format/BreakableToken.h
@@ -18,6 +18,7 @@
#define LLVM_CLANG_FORMAT_BREAKABLETOKEN_H
#include "TokenAnnotator.h"
+#include "Utils.h"
#include "WhitespaceManager.h"
#include <utility>
@@ -65,9 +66,11 @@
WhitespaceManager &Whitespaces) {}
protected:
- BreakableToken(const FormatToken &Tok) : Tok(Tok) {}
+ BreakableToken(const FormatToken &Tok, utils::Encoding Encoding)
+ : Tok(Tok), Encoding(Encoding) {}
const FormatToken &Tok;
+ utils::Encoding Encoding;
};
/// \brief Base class for single line tokens that can be broken.
@@ -83,7 +86,8 @@
protected:
BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
- StringRef Prefix, StringRef Postfix);
+ StringRef Prefix, StringRef Postfix,
+ utils::Encoding Encoding);
// The column in which the token starts.
unsigned StartColumn;
@@ -101,7 +105,8 @@
///
/// \p StartColumn specifies the column in which the token will start
/// after formatting.
- BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn);
+ BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
+ utils::Encoding Encoding);
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const;
@@ -113,7 +118,8 @@
///
/// \p StartColumn specifies the column in which the comment will start
/// after formatting.
- BreakableLineComment(const FormatToken &Token, unsigned StartColumn);
+ BreakableLineComment(const FormatToken &Token, unsigned StartColumn,
+ utils::Encoding Encoding);
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
unsigned ColumnLimit) const;
@@ -129,7 +135,7 @@
/// If the comment starts a line after formatting, set \p FirstInLine to true.
BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token,
unsigned StartColumn, unsigned OriginaStartColumn,
- bool FirstInLine);
+ bool FirstInLine, utils::Encoding Encoding);
virtual unsigned getLineCount() const;
virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
Index: lib/Format/Format.cpp
===================================================================
--- lib/Format/Format.cpp
+++ lib/Format/Format.cpp
@@ -18,6 +18,7 @@
#include "BreakableToken.h"
#include "TokenAnnotator.h"
#include "UnwrappedLineParser.h"
+#include "Utils.h"
#include "WhitespaceManager.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/OperatorPrecedence.h"
@@ -243,10 +244,11 @@
UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
const AnnotatedLine &Line, unsigned FirstIndent,
const FormatToken *RootToken,
- WhitespaceManager &Whitespaces)
+ WhitespaceManager &Whitespaces,
+ utils::Encoding Encoding)
: Style(Style), SourceMgr(SourceMgr), Line(Line),
FirstIndent(FirstIndent), RootToken(RootToken),
- Whitespaces(Whitespaces), Count(0) {}
+ Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
/// \brief Formats an \c UnwrappedLine.
void format(const AnnotatedLine *NextLine) {
@@ -484,7 +486,7 @@
State.NextToken->WhitespaceRange.getEnd()) -
SourceMgr.getSpellingColumnNumber(
State.NextToken->WhitespaceRange.getBegin());
- State.Column += WhitespaceLength + State.NextToken->TokenLength;
+ State.Column += WhitespaceLength + State.NextToken->CodePointCount;
State.NextToken = State.NextToken->Next;
return 0;
}
@@ -520,11 +522,11 @@
Line.StartsDefinition)) {
State.Column = State.Stack.back().Indent;
} else if (Current.Type == TT_ObjCSelectorName) {
- if (State.Stack.back().ColonPos > Current.TokenLength) {
- State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+ if (State.Stack.back().ColonPos > Current.CodePointCount) {
+ State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
} else {
State.Column = State.Stack.back().Indent;
- State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+ State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
}
} else if (Current.Type == TT_StartOfName ||
Previous.isOneOf(tok::coloncolon, tok::equal) ||
@@ -560,7 +562,7 @@
State.Stack.back().LastSpace = State.Column;
if (Current.isOneOf(tok::arrow, tok::period) &&
Current.Type != TT_DesignatedInitializerPeriod)
- State.Stack.back().LastSpace += Current.TokenLength;
+ State.Stack.back().LastSpace += Current.CodePointCount;
State.StartOfLineLevel = State.ParenLevel;
State.LowestCallLevel = State.ParenLevel;
@@ -595,8 +597,8 @@
State.Stack.back().VariablePos = State.Column;
// Move over * and & if they are bound to the variable name.
const FormatToken *Tok = &Previous;
- while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
- State.Stack.back().VariablePos -= Tok->TokenLength;
+ while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+ State.Stack.back().VariablePos -= Tok->CodePointCount;
if (Tok->SpacesRequiredBefore != 0)
break;
Tok = Tok->Previous;
@@ -614,12 +616,12 @@
if (Current.Type == TT_ObjCSelectorName &&
State.Stack.back().ColonPos == 0) {
if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
- State.Column + Spaces + Current.TokenLength)
+ State.Column + Spaces + Current.CodePointCount)
State.Stack.back().ColonPos =
State.Stack.back().Indent + Current.LongestObjCSelectorName;
else
State.Stack.back().ColonPos =
- State.Column + Spaces + Current.TokenLength;
+ State.Column + Spaces + Current.CodePointCount;
}
if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -671,7 +673,7 @@
State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
State.Stack.back().StartOfFunctionCall =
- Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+ Current.LastInChainOfCalls ? 0 : State.Column + Current.CodePointCount;
}
if (Current.Type == TT_CtorInitializerColon) {
// Indent 2 from the column, so:
@@ -779,7 +781,7 @@
State.StartOfStringLiteral = 0;
}
- State.Column += Current.TokenLength;
+ State.Column += Current.CodePointCount;
State.NextToken = State.NextToken->Next;
@@ -798,7 +800,7 @@
bool DryRun) {
unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
llvm::OwningPtr<BreakableToken> Token;
- unsigned StartColumn = State.Column - Current.TokenLength;
+ unsigned StartColumn = State.Column - Current.CodePointCount;
unsigned OriginalStartColumn =
SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
1;
@@ -811,15 +813,16 @@
if (!LiteralData || *LiteralData != '"')
return 0;
- Token.reset(new BreakableStringLiteral(Current, StartColumn));
+ Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
} else if (Current.Type == TT_BlockComment) {
BreakableBlockComment *BBC = new BreakableBlockComment(
- Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+ Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+ Encoding);
Token.reset(BBC);
} else if (Current.Type == TT_LineComment &&
(Current.Previous == NULL ||
Current.Previous->Type != TT_ImplicitStringLiteral)) {
- Token.reset(new BreakableLineComment(Current, StartColumn));
+ Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
} else {
return 0;
}
@@ -1080,13 +1083,16 @@
// Increasing count of \c StateNode items we have created. This is used
// to create a deterministic order independent of the container.
unsigned Count;
+ utils::Encoding Encoding;
};
class FormatTokenLexer {
public:
- FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+ FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+ utils::Encoding Encoding)
: FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
- SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+ SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+ Encoding(Encoding) {
Lex.SetKeepWhitespaceMode(true);
}
@@ -1112,6 +1118,7 @@
FormatTok->WhitespaceRange =
SourceRange(GreaterLocation, GreaterLocation);
FormatTok->TokenLength = 1;
+ FormatTok->CodePointCount = 1;
GreaterStashed = false;
return FormatTok;
}
@@ -1180,6 +1187,10 @@
GreaterStashed = true;
}
+ unsigned EncodingExtraBytes =
+ Text.size() - utils::getCodePointCount(Text, Encoding);
+ FormatTok->CodePointCount = FormatTok->TokenLength - EncodingExtraBytes;
+
FormatTok->WhitespaceRange = SourceRange(
WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
FormatTok->TokenText = StringRef(
@@ -1194,6 +1205,7 @@
Lexer &Lex;
SourceManager &SourceMgr;
IdentifierTable IdentTable;
+ utils::Encoding Encoding;
llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
SmallVector<FormatToken *, 16> Tokens;
@@ -1209,12 +1221,17 @@
Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
const std::vector<CharSourceRange> &Ranges)
: Style(Style), Lex(Lex), SourceMgr(SourceMgr),
- Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+ Whitespaces(SourceMgr, Style), Ranges(Ranges),
+ Encoding(utils::detectEncoding(Lex.getBuffer())) {
+ DEBUG(llvm::dbgs()
+ << "File encoding: "
+ << (Encoding == utils::Encoding_UTF8 ? "UTF8" : "unknown") << "\n");
+ }
virtual ~Formatter() {}
tooling::Replacements format() {
- FormatTokenLexer Tokens(Lex, SourceMgr);
+ FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
bool StructuralError = Parser.parse();
@@ -1290,7 +1307,7 @@
1;
}
UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
- TheLine.First, Whitespaces);
+ TheLine.First, Whitespaces, Encoding);
Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
IndentForLevel[TheLine.Level] = LevelIndent;
PreviousLineWasTouched = true;
@@ -1616,6 +1633,8 @@
WhitespaceManager Whitespaces;
std::vector<CharSourceRange> Ranges;
std::vector<AnnotatedLine> AnnotatedLines;
+
+ utils::Encoding Encoding;
};
tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
Index: lib/Format/FormatToken.h
===================================================================
--- lib/Format/FormatToken.h
+++ lib/Format/FormatToken.h
@@ -61,11 +61,12 @@
struct FormatToken {
FormatToken()
: NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
- TokenLength(0), IsFirst(false), MustBreakBefore(false),
- Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false),
- ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0),
- UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
- LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
+ TokenLength(0), CodePointCount(0), IsFirst(false),
+ MustBreakBefore(false), Type(TT_Unknown), SpacesRequiredBefore(0),
+ CanBreakBefore(false), ClosesTemplateDeclaration(false),
+ ParameterCount(0), TotalLength(0), UnbreakableTailLength(0),
+ BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
+ FakeRParens(0), LastInChainOfCalls(false),
PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
Next(NULL) {}
@@ -94,6 +95,10 @@
/// with the token.
unsigned TokenLength;
+ /// \brief The length of the non-whitespace parts of the token in code points.
+ /// We need this to correctly measure the number of columns a token spans.
+ unsigned CodePointCount;
+
/// \brief Indicates that this is the first token.
bool IsFirst;
Index: lib/Format/Utils.h
===================================================================
--- /dev/null
+++ lib/Format/Utils.h
@@ -0,0 +1,104 @@
+//===--- Utils.h - Format C++ code ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Contains various utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FORMAT_UTILS_H
+#define LLVM_CLANG_FORMAT_UTILS_H
+
+#include "clang/Basic/LLVM.h"
+#include "llvm/Support/ConvertUTF.h"
+
+namespace clang {
+namespace format {
+namespace utils {
+
+enum Encoding {
+ Encoding_UTF8,
+ Encoding_Unknown
+};
+
+inline Encoding detectEncoding(StringRef Text) {
+ const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
+ const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
+ if (::isLegalUTF8String(&Ptr, BufEnd))
+ return Encoding_UTF8;
+ return Encoding_Unknown;
+}
+
+inline unsigned getCodePointCountUTF8(StringRef Text) {
+ unsigned CodePoints = 0;
+ for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
+ ++CodePoints;
+ }
+ return CodePoints;
+}
+
+inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
+ switch (Encoding) {
+ case Encoding_UTF8:
+ return getCodePointCountUTF8(Text);
+ default:
+ return Text.size();
+ }
+}
+
+inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
+ switch (Encoding) {
+ case Encoding_UTF8:
+ return getNumBytesForUTF8(FirstChar);
+ default:
+ return 1;
+ }
+}
+
+inline bool isOctDigit(char c) {
+ return '0' <= c && c <= '7';
+}
+
+inline bool isHexDigit(char c) {
+ return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
+ ('A' <= c && c <= 'F');
+}
+
+inline unsigned getEscapeSequenceLength(StringRef Text) {
+ assert(Text[0] == '\\');
+ if (Text.size() < 2)
+ return 1;
+
+ switch (Text[1]) {
+ case 'u':
+ return 6;
+ case 'U':
+ return 10;
+ case 'x': {
+ unsigned I = 2; // Point after '\x'.
+ while (I < Text.size() && isHexDigit(Text[I]))
+ ++I;
+ return I;
+ }
+ default:
+ if (isOctDigit(Text[1])) {
+ unsigned I = 1;
+ while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
+ ++I;
+ return I;
+ }
+ return 2;
+ }
+}
+
+} // namespace utils
+} // namespace format
+} // namespace clang
+
+#endif // LLVM_CLANG_FORMAT_UTILS_H
Index: unittests/Format/FormatTest.cpp
===================================================================
--- unittests/Format/FormatTest.cpp
+++ unittests/Format/FormatTest.cpp
@@ -4873,5 +4873,80 @@
EXPECT_EQ(Style, ParsedStyle);
}
+TEST_F(FormatTest, WorksFor8bitEncodings) {
+ EXPECT_EQ("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 \"\n"
+ "\"\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \"\n"
+ "\"\xe7\xe8\xec\xed\xfe\xfe \"\n"
+ "\"\xef\xee\xf0\xf3...\"",
+ format("\"\xce\xe4\xed\xe0\xe6\xe4\xfb \xe2 "
+ "\xf1\xf2\xf3\xe4\xb8\xed\xf3\xfe \xe7\xe8\xec\xed\xfe\xfe "
+ "\xef\xee\xf0\xf3...\"",
+ getLLVMStyleWithColumns(12)));
+}
+
+TEST_F(FormatTest, CountsUTF8CharactersProperly) {
+ verifyFormat("\"Однажды в студёную зимнюю пору...\"",
+ getLLVMStyleWithColumns(35));
+ verifyFormat("\"一 二 三 四 五 六 七 八 九 十\"",
+ getLLVMStyleWithColumns(21));
+ verifyFormat("// Однажды в студёную зимнюю пору...",
+ getLLVMStyleWithColumns(36));
+ verifyFormat("// 一 二 三 四 五 六 七 八 九 十",
+ getLLVMStyleWithColumns(22));
+ verifyFormat("/* Однажды в студёную зимнюю пору... */",
+ getLLVMStyleWithColumns(39));
+ verifyFormat("/* 一 二 三 四 五 六 七 八 九 十 */",
+ getLLVMStyleWithColumns(25));
+}
+
+TEST_F(FormatTest, SplitsUTF8Strings) {
+ EXPECT_EQ(
+ "\"Однажды, в \"\n"
+ "\"студёную \"\n"
+ "\"зимнюю \"\n"
+ "\"пору,\"",
+ format("\"Однажды, в студёную зимнюю пору,\"",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("\"一 二 三 四 \"\n"
+ "\"五 六 七 八 \"\n"
+ "\"九 十\"",
+ format("\"一 二 三 四 五 六 七 八 九 十\"",
+ getLLVMStyleWithColumns(10)));
+}
+
+TEST_F(FormatTest, SplitsUTF8LineComments) {
+ EXPECT_EQ("// Я из лесу\n"
+ "// вышел; был\n"
+ "// сильный\n"
+ "// мороз.",
+ format("// Я из лесу вышел; был сильный мороз.",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("// 一二三\n"
+ "// 四五六七\n"
+ "// 八\n"
+ "// 九 十",
+ format("// 一二三 四五六七 八 九 十", getLLVMStyleWithColumns(6)));
+}
+
+TEST_F(FormatTest, SplitsUTF8BlockComments) {
+ EXPECT_EQ("/* Гляжу,\n"
+ " * поднимается\n"
+ " * медленно в\n"
+ " * гору\n"
+ " * Лошадка,\n"
+ " * везущая\n"
+ " * хворосту\n"
+ " * воз. */",
+ format("/* Гляжу, поднимается медленно в гору\n"
+ " * Лошадка, везущая хворосту воз. */",
+ getLLVMStyleWithColumns(13)));
+ EXPECT_EQ("/* 一二三\n"
+ " * 四五六七\n"
+ " * 八\n"
+ " * 九 十\n"
+ " */",
+ format("/* 一二三 四五六七 八 九 十 */", getLLVMStyleWithColumns(6)));
+}
+
} // end namespace tooling
} // end namespace clang
_______________________________________________
cfe-commits mailing list
cfe-commits@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits