Re: [PATCH] Calculate and store ColumnWidth instead of CodePointCount in FormatTokens.

Alexander Kornienko Fri, 06 Sep 2013 16:05:41 -0700

  Calculate OriginalColumn and LastLineColumnWidth for all tokens. Removed 
FirstLineColumnWidth and use ColumnWidth for this purpose for multiline tokens.


Hi klimek, djasper,

http://llvm-reviews.chandlerc.com/D1608

CHANGE SINCE LAST DIFF
  http://llvm-reviews.chandlerc.com/D1608?vs=4096&id=4112#toc

Files:
  lib/Format/BreakableToken.cpp
  lib/Format/ContinuationIndenter.cpp
  lib/Format/Format.cpp
  lib/Format/FormatToken.cpp
  lib/Format/FormatToken.h
  lib/Format/TokenAnnotator.cpp
  unittests/Format/FormatTest.cpp

Index: lib/Format/BreakableToken.cpp
===================================================================
--- lib/Format/BreakableToken.cpp
+++ lib/Format/BreakableToken.cpp
@@ -337,8 +337,10 @@
   // if leading tabs are intermixed with spaces, that is not a high priority.
 
   // Adjust the start column uniformly accross all lines.
-  StartOfLineColumn[LineIndex] =
-      std::max<int>(0, Whitespace.size() + IndentDelta);
+  StartOfLineColumn[LineIndex] = std::max<int>(
+      0,
+      encoding::columnWidthWithTabs(Whitespace, 0, Style.TabWidth, Encoding) +
+          IndentDelta);
 }
 
 unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }
Index: lib/Format/ContinuationIndenter.cpp
===================================================================
--- lib/Format/ContinuationIndenter.cpp
+++ lib/Format/ContinuationIndenter.cpp
@@ -201,7 +201,7 @@
                                State.NextToken->WhitespaceRange.getEnd()) -
                            SourceMgr.getSpellingColumnNumber(
                                State.NextToken->WhitespaceRange.getBegin());
-    State.Column += WhitespaceLength + State.NextToken->CodePointCount;
+    State.Column += WhitespaceLength + State.NextToken->ColumnWidth;
     State.NextToken = State.NextToken->Next;
     return 0;
   }
@@ -252,11 +252,11 @@
                  State.Line->StartsDefinition))) {
       State.Column = State.Stack.back().Indent;
     } else if (Current.Type == TT_ObjCSelectorName) {
-      if (State.Stack.back().ColonPos > Current.CodePointCount) {
-        State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
+      if (State.Stack.back().ColonPos > Current.ColumnWidth) {
+        State.Column = State.Stack.back().ColonPos - Current.ColumnWidth;
       } else {
         State.Column = State.Stack.back().Indent;
-        State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
+        State.Stack.back().ColonPos = State.Column + Current.ColumnWidth;
       }
     } else if (Current.is(tok::l_square) && Current.Type != TT_ObjCMethodExpr &&
                Current.Type != TT_LambdaLSquare) {
@@ -302,7 +302,7 @@
     if (!Current.isTrailingComment())
       State.Stack.back().LastSpace = State.Column;
     if (Current.isMemberAccess())
-      State.Stack.back().LastSpace += Current.CodePointCount;
+      State.Stack.back().LastSpace += Current.ColumnWidth;
     State.StartOfLineLevel = State.ParenLevel;
     State.LowestLevelOnLine = State.ParenLevel;
 
@@ -338,8 +338,8 @@
       State.Stack.back().VariablePos = State.Column;
       // Move over * and & if they are bound to the variable name.
       const FormatToken *Tok = &Previous;
-      while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
-        State.Stack.back().VariablePos -= Tok->CodePointCount;
+      while (Tok && State.Stack.back().VariablePos >= Tok->ColumnWidth) {
+        State.Stack.back().VariablePos -= Tok->ColumnWidth;
         if (Tok->SpacesRequiredBefore != 0)
           break;
         Tok = Tok->Previous;
@@ -356,12 +356,12 @@
     if (Current.Type == TT_ObjCSelectorName &&
         State.Stack.back().ColonPos == 0) {
       if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
-          State.Column + Spaces + Current.CodePointCount)
+          State.Column + Spaces + Current.ColumnWidth)
         State.Stack.back().ColonPos =
             State.Stack.back().Indent + Current.LongestObjCSelectorName;
       else
         State.Stack.back().ColonPos =
-            State.Column + Spaces + Current.CodePointCount;
+            State.Column + Spaces + Current.ColumnWidth;
     }
 
     if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -431,7 +431,7 @@
         std::min(State.LowestLevelOnLine, State.ParenLevel);
   if (Current.isMemberAccess())
     State.Stack.back().StartOfFunctionCall =
-        Current.LastInChainOfCalls ? 0 : State.Column + Current.CodePointCount;
+        Current.LastInChainOfCalls ? 0 : State.Column + Current.ColumnWidth;
   if (Current.Type == TT_CtorInitializerColon) {
     // Indent 2 from the column, so:
     // SomeClass::SomeClass()
@@ -587,7 +587,7 @@
     State.StartOfStringLiteral = 0;
   }
 
-  State.Column += Current.CodePointCount;
+  State.Column += Current.ColumnWidth;
   State.NextToken = State.NextToken->Next;
   unsigned Penalty = breakProtrudingToken(Current, State, DryRun);
   if (State.Column > getColumnLimit(State)) {
@@ -613,8 +613,7 @@
   for (unsigned i = 0, e = State.Stack.size(); i != e; ++i)
     State.Stack[i].BreakBeforeParameter = true;
 
-  unsigned ColumnsUsed =
-      State.Column - Current.CodePointCount + Current.FirstLineColumnWidth;
+  unsigned ColumnsUsed = State.Column;
   // We can only affect layout of the first and the last line, so the penalty
   // for all other lines is constant, and we ignore it.
   State.Column = Current.LastLineColumnWidth;
@@ -631,7 +630,7 @@
     return 0;
 
   llvm::OwningPtr<BreakableToken> Token;
-  unsigned StartColumn = State.Column - Current.CodePointCount;
+  unsigned StartColumn = State.Column - Current.ColumnWidth;
 
   if (Current.is(tok::string_literal) &&
       Current.Type != TT_ImplicitStringLiteral) {
@@ -652,11 +651,8 @@
     Token.reset(new BreakableStringLiteral(
         Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) {
-    unsigned OriginalStartColumn =
-        SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
-        1;
     Token.reset(new BreakableBlockComment(
-        Current, StartColumn, OriginalStartColumn, !Current.Previous,
+        Current, StartColumn, Current.OriginalColumn, !Current.Previous,
         State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_LineComment &&
              (Current.Previous == NULL ||
@@ -668,10 +664,8 @@
     // FIXME: If we want to handle them correctly, we'll need to adjust
     // leading whitespace in consecutive lines when changing indentation of
     // the first line similar to what we do with block comments.
-    if (Current.isMultiline()) {
-      State.Column = StartColumn + Current.FirstLineColumnWidth;
+    if (Current.isMultiline())
       return 0;
-    }
 
     Token.reset(new BreakableLineComment(
         Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
@@ -759,7 +753,7 @@
   if (Current.getNextNonComment() &&
       Current.getNextNonComment()->is(tok::string_literal))
     return true; // Implicit concatenation.
-  if (State.Column + Current.CodePointCount + Current.UnbreakableTailLength >
+  if (State.Column + Current.ColumnWidth + Current.UnbreakableTailLength >
       Style.ColumnLimit)
     return true; // String will be split.
   return false;
Index: lib/Format/Format.cpp
===================================================================
--- lib/Format/Format.cpp
+++ lib/Format/Format.cpp
@@ -603,7 +603,7 @@
       FormatTok->WhitespaceRange =
           SourceRange(GreaterLocation, GreaterLocation);
       FormatTok->TokenText = ">";
-      FormatTok->CodePointCount = 1;
+      FormatTok->ColumnWidth = 1;
       GreaterStashed = false;
       return FormatTok;
     }
@@ -677,25 +677,28 @@
       GreaterStashed = true;
     }
 
-    // Now FormatTok is the next non-whitespace token.
-    FormatTok->CodePointCount =
-        encoding::getCodePointCount(FormatTok->TokenText, Encoding);
-
-    if (FormatTok->isOneOf(tok::string_literal, tok::comment)) {
-      StringRef Text = FormatTok->TokenText;
-      size_t FirstNewlinePos = Text.find('\n');
-      if (FirstNewlinePos != StringRef::npos) {
-        // FIXME: Handle embedded tabs.
-        FormatTok->FirstLineColumnWidth = encoding::columnWidthWithTabs(
-            Text.substr(0, FirstNewlinePos), 0, Style.TabWidth, Encoding);
-        FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
-            Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
-            Encoding);
-      }
-    }
-    // FIXME: Add the CodePointCount to Column.
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
+    // Now FormatTok is the next non-whitespace token.
+
+    StringRef Text = FormatTok->TokenText;
+    size_t FirstNewlinePos = Text.find('\n');
+    if (FirstNewlinePos != StringRef::npos) {
+      // LastLineColumnWidth is always correct, as it always starts in the
+      // beginning of the line.
+      FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
+          Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
+          Encoding);
+      Text = Text.substr(0, FirstNewlinePos);
+    }
+
+    // FIXME: ColumnWidth actually depends on the start column, we need to take
+    // this to account when the token is moved.
+    FormatTok->ColumnWidth =
+        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
+
+    Column += FormatTok->ColumnWidth;
+
     return FormatTok;
   }
 
@@ -955,6 +958,14 @@
     if (Indent > Style.ColumnLimit)
       return;
 
+    // Never try to merge lines with multiline tokens.
+    for (SmallVectorImpl<AnnotatedLine *>::iterator It = I; It != E; ++It) {
+      for (const FormatToken *Tok = (*It)->First; Tok; Tok = Tok->Next) {
+        if (Tok->isMultiline())
+          return;
+      }
+    }
+
     unsigned Limit = Style.ColumnLimit - Indent;
     // If we already exceed the column limit, we set 'Limit' to 0. The different
     // tryMerge..() functions can then decide whether to still do merging.
Index: lib/Format/FormatToken.cpp
===================================================================
--- lib/Format/FormatToken.cpp
+++ lib/Format/FormatToken.cpp
@@ -42,7 +42,7 @@
   // Calculate the number of code points we have to format this list. As the
   // first token is already placed, we have to subtract it.
   unsigned RemainingCodePoints = Style.ColumnLimit - State.Column +
-                                 State.NextToken->Previous->CodePointCount;
+                                 State.NextToken->Previous->ColumnWidth;
 
   // Find the best ColumnFormat, i.e. the best number of columns to use.
   const ColumnFormat *Format = getColumnFormat(RemainingCodePoints);
@@ -82,7 +82,7 @@
 static unsigned CodePointsBetween(const FormatToken *Begin,
                                   const FormatToken *End) {
   assert(End->TotalLength >= Begin->TotalLength);
-  return End->TotalLength - Begin->TotalLength + Begin->CodePointCount;
+  return End->TotalLength - Begin->TotalLength + Begin->ColumnWidth;
 }
 
 void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
Index: lib/Format/FormatToken.h
===================================================================
--- lib/Format/FormatToken.h
+++ lib/Format/FormatToken.h
@@ -83,8 +83,8 @@
 struct FormatToken {
   FormatToken()
       : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        CodePointCount(0), FirstLineColumnWidth(0), LastLineColumnWidth(0),
-        IsFirst(false), MustBreakBefore(false), IsUnterminatedLiteral(false),
+        ColumnWidth(0), LastLineColumnWidth(NoNewline), IsFirst(false),
+        MustBreakBefore(false), IsUnterminatedLiteral(false),
         BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0),
         CanBreakBefore(false), ClosesTemplateDeclaration(false),
         ParameterCount(0), PackingKind(PPK_Inconclusive), TotalLength(0),
@@ -114,22 +114,22 @@
   /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'.
   unsigned LastNewlineOffset;
 
-  /// \brief The length of the non-whitespace parts of the token in CodePoints.
+  /// \brief The width of the non-whitespace parts of the token (or its first
+  /// line for multi-line tokens) in columns.
   /// We need this to correctly measure number of columns a token spans.
-  unsigned CodePointCount;
+  unsigned ColumnWidth;
 
-  /// \brief Contains the number of code points in the first line of a
-  /// multi-line string literal or comment. Zero if there's no newline in the
-  /// token.
-  unsigned FirstLineColumnWidth;
+  /// \brief When assigned to the \c LastLineColumnWidth, indicates that there's
+  /// no newline in the token.
+  enum { NoNewline = UINT_MAX };
 
-  /// \brief Contains the number of code points in the last line of a
-  /// multi-line string literal or comment. Can be zero for line comments.
+  /// \brief Contains the width in columns of the last line of a multi-line
+  /// token. NoNewline if there's no newline in the token.
   unsigned LastLineColumnWidth;
 
   /// \brief Returns \c true if the token text contains newlines (escaped or
   /// not).
-  bool isMultiline() const { return FirstLineColumnWidth != 0; }
+  bool isMultiline() const { return LastLineColumnWidth != NoNewline; }
 
   /// \brief Indicates that this is the first token.
   bool IsFirst;
@@ -189,12 +189,12 @@
   /// token.
   unsigned TotalLength;
 
-  /// \brief The original column of this token, including expanded tabs.
-  /// The configured IndentWidth is used as tab width. Only tabs in whitespace
-  /// are expanded.
-  /// FIXME: This is currently only used on the first token of an unwrapped
-  /// line, and the implementation is not correct for other tokens (see the
-  /// FIXMEs in FormatTokenLexer::getNextToken()).
+  /// \brief The original 0-based column of this token, including expanded tabs.
+  /// The configured TabWidth is used as tab width.
+  ///
+  /// FIXME: This is implemented correctly only for block comments and for the
+  /// first token of an unwrapped line (see the FIXMEs in
+  /// FormatTokenLexer::getNextToken()).
   unsigned OriginalColumn;
 
   /// \brief The length of following tokens until the next natural split point,
Index: lib/Format/TokenAnnotator.cpp
===================================================================
--- lib/Format/TokenAnnotator.cpp
+++ lib/Format/TokenAnnotator.cpp
@@ -326,10 +326,10 @@
                  Line.First->Type == TT_ObjCMethodSpecifier) {
         Tok->Type = TT_ObjCMethodExpr;
         Tok->Previous->Type = TT_ObjCSelectorName;
-        if (Tok->Previous->CodePointCount >
+        if (Tok->Previous->ColumnWidth >
             Contexts.back().LongestObjCSelectorName) {
           Contexts.back().LongestObjCSelectorName =
-              Tok->Previous->CodePointCount;
+              Tok->Previous->ColumnWidth;
         }
         if (Contexts.back().FirstObjCSelectorName == NULL)
           Contexts.back().FirstObjCSelectorName = Tok->Previous;
@@ -1022,7 +1022,7 @@
 }
 
 void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
-  Line.First->TotalLength = Line.First->CodePointCount;
+  Line.First->TotalLength = Line.First->ColumnWidth;
   if (!Line.First->Next)
     return;
   FormatToken *Current = Line.First->Next;
@@ -1059,7 +1059,7 @@
       Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
     else
       Current->TotalLength = Current->Previous->TotalLength +
-                             Current->CodePointCount +
+                             Current->ColumnWidth +
                              Current->SpacesRequiredBefore;
     // FIXME: Only calculate this if CanBreakBefore is true once static
     // initializers etc. are sorted out.
@@ -1095,7 +1095,7 @@
       UnbreakableTailLength = 0;
     } else {
       UnbreakableTailLength +=
-          Current->CodePointCount + Current->SpacesRequiredBefore;
+          Current->ColumnWidth + Current->SpacesRequiredBefore;
     }
     Current = Current->Previous;
   }
Index: unittests/Format/FormatTest.cpp
===================================================================
--- unittests/Format/FormatTest.cpp
+++ unittests/Format/FormatTest.cpp
@@ -5717,33 +5717,48 @@
   // FIXME: To correctly count mixed whitespace we need to
   // also correctly count mixed whitespace in front of the comment.
 
-  // Tab.TabWidth = 8;
-  // Tab.IndentWidth = 8;
-  // EXPECT_EQ("/*\n"
-  //           "\t      a\t\tcomment\n"
-  //           "\t      in multiple lines\n"
-  //           "       */",
-  //           format("   /*\t \t \n"
-  //                  " \t \t a\t\tcomment\t \t\n"
-  //                  " \t \t in multiple lines\t\n"
-  //                  " \t  */",
-  //                  Tab));
-  // Tab.UseTab = false;
-  // EXPECT_EQ("/*\n"
-  //           "              a\t\tcomment\n"
-  //           "              in multiple lines\n"
-  //           "       */",
-  //           format("   /*\t \t \n"
-  //                  " \t \t a\t\tcomment\t \t\n"
-  //                  " \t \t in multiple lines\t\n"
-  //                  " \t  */",
-  //                  Tab));
-  // EXPECT_EQ("/* some\n"
-  //           "   comment */",
-  //          format(" \t \t /* some\n"
-  //                 " \t \t    comment */",
-  //                 Tab));
-
+  Tab.TabWidth = 8;
+  Tab.IndentWidth = 8;
+  EXPECT_EQ("/*\n"
+            "\t      a\t\tcomment\n"
+            "\t      in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  Tab.UseTab = false;
+  EXPECT_EQ("/*\n"
+            "              a\t\tcomment\n"
+            "              in multiple lines\n"
+            "       */",
+            format("   /*\t \t \n"
+                   " \t \t a\t\tcomment\t \t\n"
+                   " \t \t in multiple lines\t\n"
+                   " \t  */",
+                   Tab));
+  EXPECT_EQ("/* some\n"
+            "   comment */",
+           format(" \t \t /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+  EXPECT_EQ("int a; /* some\n"
+            "   comment */",
+           format(" \t \t int a; /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+
+  EXPECT_EQ("int a; /* some\n"
+            "comment */",
+           format(" \t \t int\ta; /* some\n"
+                  " \t \t    comment */",
+                  Tab));
+  EXPECT_EQ("f(\"\t\t\"); /* some\n"
+            "    comment */",
+           format(" \t \t f(\"\t\t\"); /* some\n"
+                  " \t \t    comment */",
+                  Tab));
   EXPECT_EQ("{\n"
             "  /*\n"
             "   * Comment\n"

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Re: [PATCH] Calculate and store ColumnWidth instead of CodePointCount in FormatTokens.

Reply via email to