[clang] [Clang][Lexer][Performance] Optimize Lexer whitespace skipping logic (PR #180819)

Thibault Monnier via cfe-commits Mon, 23 Feb 2026 04:35:34 -0800

https://github.com/Thibault-Monnier updated 
https://github.com/llvm/llvm-project/pull/180819


>From faa899a6ce518c1176f2bf59f199eb42e59d840e Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Tue, 10 Feb 2026 19:41:47 +0100
Subject: [PATCH 1/3] Try prioritizing skipping space

---
 clang/lib/Lex/Lexer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 1498657047bd6..483cca32e08a2 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2533,8 +2533,8 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
 
   // Skip consecutive spaces efficiently.
   while (true) {
-    // Skip horizontal whitespace very aggressively.
-    while (isHorizontalWhitespace(Char))
+    // Skip horizontal whitespace, especially space, very aggressively.
+    while (LLVM_LIKELY(Char == ' ') || isHorizontalWhitespace(Char))
       Char = *++CurPtr;
 
     // Otherwise if we have something other than whitespace, we're done.
@@ -3756,10 +3756,10 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
   const char *CurPtr = BufferPtr;
 
   // Small amounts of horizontal whitespace is very common between tokens.
-  if (isHorizontalWhitespace(*CurPtr)) {
+  if (LLVM_LIKELY(*CurPtr == ' ') || isHorizontalWhitespace(*CurPtr)) {
     do {
       ++CurPtr;
-    } while (isHorizontalWhitespace(*CurPtr));
+    } while (LLVM_LIKELY(*CurPtr == ' ') || isHorizontalWhitespace(*CurPtr));
 
     // If we are keeping whitespace and other tokens, just return what we just
     // skipped.  The next lexer invocation will return the token after the

>From 0ddd945fb9fbde93a49747d5ba2e24d39425a752 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Mon, 23 Feb 2026 10:40:32 +0100
Subject: [PATCH 2/3] Try remove LLVM_LIKELY

---
 clang/lib/Lex/Lexer.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 483cca32e08a2..446a8a6eb7f63 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2534,7 +2534,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
   // Skip consecutive spaces efficiently.
   while (true) {
     // Skip horizontal whitespace, especially space, very aggressively.
-    while (LLVM_LIKELY(Char == ' ') || isHorizontalWhitespace(Char))
+    while (Char == ' ' || isHorizontalWhitespace(Char))
       Char = *++CurPtr;
 
     // Otherwise if we have something other than whitespace, we're done.
@@ -3756,10 +3756,12 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
   const char *CurPtr = BufferPtr;
 
   // Small amounts of horizontal whitespace is very common between tokens.
-  if (LLVM_LIKELY(*CurPtr == ' ') || isHorizontalWhitespace(*CurPtr)) {
+  // Check for space character separately to skip the expensive
+  // isHorizontalWhitespace() check
+  if (*CurPtr == ' ' || isHorizontalWhitespace(*CurPtr)) {
     do {
       ++CurPtr;
-    } while (LLVM_LIKELY(*CurPtr == ' ') || isHorizontalWhitespace(*CurPtr));
+    } while (*CurPtr == ' ' || isHorizontalWhitespace(*CurPtr));
 
     // If we are keeping whitespace and other tokens, just return what we just
     // skipped.  The next lexer invocation will return the token after the

>From 62e29012c0ade20d916b9c0d8e111b60758d5326 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Mon, 23 Feb 2026 13:35:07 +0100
Subject: [PATCH 3/3] Try changing isHorizontalWhitespace directly

---
 clang/include/clang/Basic/CharInfo.h | 2 +-
 clang/lib/Lex/Lexer.cpp              | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h
index 87626eeb8a700..1ede0328952d8 100644
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@@ -90,7 +90,7 @@ LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c,
 /// Note that this returns false for '\\0'.
 LLVM_READONLY inline bool isHorizontalWhitespace(unsigned char c) {
   using namespace charinfo;
-  return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0;
+  return c == ' ' || (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0;
 }
 
 /// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'.
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 446a8a6eb7f63..4dc2eebdf0e97 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2534,7 +2534,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
   // Skip consecutive spaces efficiently.
   while (true) {
     // Skip horizontal whitespace, especially space, very aggressively.
-    while (Char == ' ' || isHorizontalWhitespace(Char))
+    while (isHorizontalWhitespace(Char))
       Char = *++CurPtr;
 
     // Otherwise if we have something other than whitespace, we're done.
@@ -3758,10 +3758,10 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
   // Small amounts of horizontal whitespace is very common between tokens.
   // Check for space character separately to skip the expensive
   // isHorizontalWhitespace() check
-  if (*CurPtr == ' ' || isHorizontalWhitespace(*CurPtr)) {
+  if (isHorizontalWhitespace(*CurPtr)) {
     do {
       ++CurPtr;
-    } while (*CurPtr == ' ' || isHorizontalWhitespace(*CurPtr));
+    } while (isHorizontalWhitespace(*CurPtr));
 
     // If we are keeping whitespace and other tokens, just return what we just
     // skipped.  The next lexer invocation will return the token after the

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][Lexer][Performance] Optimize Lexer whitespace skipping logic (PR #180819)

Reply via email to