DaanDeMeyer created this revision. DaanDeMeyer added a reviewer: sammccall. DaanDeMeyer added a project: clang-tools-extra. Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, ioeric, ilya-biryukov.
See http://lists.llvm.org/pipermail/clangd-dev/2018-October/000171.html for context. I kept the error (instead of downgrading to a log message) since the range lengths differing does indicate either a bug in the client or server range calculation or the buffers being out of sync (which both seems serious enough to me to be an error). If any existing clients aside from VSCode break they should only break when accidentally typing a Unicode character which should only be a minor nuisance for a little while until the bug is fixed in the respective client. Repository: rCTE Clang Tools Extra https://reviews.llvm.org/D53527 Files: clangd/DraftStore.cpp clangd/SourceCode.cpp clangd/SourceCode.h Index: clangd/SourceCode.cpp =================================================================== --- clangd/SourceCode.cpp +++ clangd/SourceCode.cpp @@ -67,13 +67,14 @@ return std::min(Result, U8.size()); } -// Counts the number of UTF-16 code units needed to represent a string. -// Like most strings in clangd, the input is UTF-8 encoded. -static size_t utf16Len(StringRef U8) { +// Counts the number of UTF-16 code units needed to represent a string (LSP +// specifies string lengths in UTF-16 code units). Like most strings in clangd, +// the input is UTF-8 encoded. +size_t lspLength(StringRef Code) { // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. size_t Count = 0; - iterateCodepoints(U8, [&](int U8Len, int U16Len) { + iterateCodepoints(Code, [&](int U8Len, int U16Len) { Count += U16Len; return false; }); @@ -123,7 +124,7 @@ size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); Position Pos; Pos.line = Lines; - Pos.character = utf16Len(Before.substr(StartOfLine)); + Pos.character = lspLength(Before.substr(StartOfLine)); return Pos; } @@ -139,7 +140,7 @@ if (!Invalid) { auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); - P.character = utf16Len(LineSoFar); + P.character = lspLength(LineSoFar); } return P; } Index: clangd/SourceCode.h =================================================================== --- clangd/SourceCode.h +++ clangd/SourceCode.h @@ -23,6 +23,9 @@ namespace clangd { +/// Calculate the length of Code according to the LSP specification. +size_t lspLength(StringRef Code); + /// Turn a [line, column] pair into an offset in Code. /// /// If P.character exceeds the line length, returns the offset at end-of-line. Index: clangd/DraftStore.cpp =================================================================== --- clangd/DraftStore.cpp +++ clangd/DraftStore.cpp @@ -77,8 +77,21 @@ End, Start), llvm::errc::invalid_argument); - if (Change.rangeLength && - (ssize_t)(*EndIndex - *StartIndex) != *Change.rangeLength) + // Since the range length between two LSP positions is dependent on the + // contents of the buffer we compute the range length between the start and + // end position ourselves and compare it to the range length of the LSP + // message to verify the buffers of the client and server are in sync. + + // The difference between EndIndex and StartIndex gives the range length in + // bytes. However, the LSP range length is specified in UTF-16 code units + // which means directly comparing them will give faulty results. To solve + // this we convert the computed range length in bytes back to UTF-16 code + // units by counting the amount of UTF-16 code units in the [StartIndex, + // EndIndex) substring of the document buffer. + ssize_t ComputedRangeLength = + lspLength(Contents.substr(*StartIndex, *EndIndex - *StartIndex)); + + if (Change.rangeLength && ComputedRangeLength != *Change.rangeLength) return make_error<StringError>( formatv("Change's rangeLength ({0}) doesn't match the " "computed range length ({1}).",
Index: clangd/SourceCode.cpp =================================================================== --- clangd/SourceCode.cpp +++ clangd/SourceCode.cpp @@ -67,13 +67,14 @@ return std::min(Result, U8.size()); } -// Counts the number of UTF-16 code units needed to represent a string. -// Like most strings in clangd, the input is UTF-8 encoded. -static size_t utf16Len(StringRef U8) { +// Counts the number of UTF-16 code units needed to represent a string (LSP +// specifies string lengths in UTF-16 code units). Like most strings in clangd, +// the input is UTF-8 encoded. +size_t lspLength(StringRef Code) { // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. size_t Count = 0; - iterateCodepoints(U8, [&](int U8Len, int U16Len) { + iterateCodepoints(Code, [&](int U8Len, int U16Len) { Count += U16Len; return false; }); @@ -123,7 +124,7 @@ size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); Position Pos; Pos.line = Lines; - Pos.character = utf16Len(Before.substr(StartOfLine)); + Pos.character = lspLength(Before.substr(StartOfLine)); return Pos; } @@ -139,7 +140,7 @@ if (!Invalid) { auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); - P.character = utf16Len(LineSoFar); + P.character = lspLength(LineSoFar); } return P; } Index: clangd/SourceCode.h =================================================================== --- clangd/SourceCode.h +++ clangd/SourceCode.h @@ -23,6 +23,9 @@ namespace clangd { +/// Calculate the length of Code according to the LSP specification. +size_t lspLength(StringRef Code); + /// Turn a [line, column] pair into an offset in Code. /// /// If P.character exceeds the line length, returns the offset at end-of-line. Index: clangd/DraftStore.cpp =================================================================== --- clangd/DraftStore.cpp +++ clangd/DraftStore.cpp @@ -77,8 +77,21 @@ End, Start), llvm::errc::invalid_argument); - if (Change.rangeLength && - (ssize_t)(*EndIndex - *StartIndex) != *Change.rangeLength) + // Since the range length between two LSP positions is dependent on the + // contents of the buffer we compute the range length between the start and + // end position ourselves and compare it to the range length of the LSP + // message to verify the buffers of the client and server are in sync. + + // The difference between EndIndex and StartIndex gives the range length in + // bytes. However, the LSP range length is specified in UTF-16 code units + // which means directly comparing them will give faulty results. To solve + // this we convert the computed range length in bytes back to UTF-16 code + // units by counting the amount of UTF-16 code units in the [StartIndex, + // EndIndex) substring of the document buffer. + ssize_t ComputedRangeLength = + lspLength(Contents.substr(*StartIndex, *EndIndex - *StartIndex)); + + if (Change.rangeLength && ComputedRangeLength != *Change.rangeLength) return make_error<StringError>( formatv("Change's rangeLength ({0}) doesn't match the " "computed range length ({1}).",
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits