bruno created this revision.
bruno added a reviewer: rsmith.
bruno added subscribers: cfe-commits, dexonsmith.

Improve invalid format string specifier handling by printing out invalid 
specifiers characters with \x, \u and \U. Previously clang would print gargabe 
whenever the character is unprintable.
    
Example, before:
  NSLog(@"%\u25B9", 3); => warning: invalid conversion specifier ' 
[-Wformat-invalid-specifier]
after:
  NSLog(@"%\u25B9", 3); => warning: invalid conversion specifier '\u25b9' 
[-Wformat-invalid-specifier]

http://reviews.llvm.org/D18296

Files:
  lib/Analysis/PrintfFormatString.cpp
  lib/Sema/SemaChecking.cpp
  test/SemaObjC/format-strings-objc.m

Index: test/SemaObjC/format-strings-objc.m
===================================================================
--- test/SemaObjC/format-strings-objc.m
+++ test/SemaObjC/format-strings-objc.m
@@ -265,3 +265,11 @@
   NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}}
 }
 
+// Test Objective-C invalid no printable specifiers
+void testObjcInvalidNoPrintable() {
+  NSLog(@"%\u25B9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\xE2\x96\xB9", 3); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\U00010348", 42); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xF0\x90\x8D\x88", 42); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xe2", @"Foo"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}}
+}
Index: lib/Sema/SemaChecking.cpp
===================================================================
--- lib/Sema/SemaChecking.cpp
+++ lib/Sema/SemaChecking.cpp
@@ -36,6 +36,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Locale.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
@@ -3976,12 +3978,41 @@
     // gibberish when trying to match arguments.
     keepGoing = false;
   }
-  
-  EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion)
-                         << StringRef(csStart, csLen),
-                       Loc, /*IsStringLocation*/true,
-                       getSpecifierRange(startSpec, specifierLen));
-  
+
+  StringRef Specifier(csStart, csLen);
+
+  // If the specifier in non-printable, it could be the first byte of a UTF-8
+  // sequence. In that case, print the UTF-8 code point. If not, print the byte
+  // hex value.
+  std::string CodePointStr;
+  if (!llvm::sys::locale::isPrint(*csStart)) {
+    UTF32 CodePoint;
+    const UTF8 **B = reinterpret_cast<const UTF8 **>(&csStart);
+    const UTF8 *E =
+        reinterpret_cast<const UTF8 *>(csStart + specifierLen - csLen);
+    ConversionResult Result =
+        llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion);
+
+    if (Result != conversionOK) {
+      unsigned char FirstChar = *csStart;
+      CodePoint = (UTF32)FirstChar;
+    }
+
+    llvm::raw_string_ostream OS(CodePointStr);
+    if (CodePoint < 256)
+      OS << "\\x" << llvm::format("%02x", CodePoint);
+    else if (CodePoint <= 0xFFFF)
+      OS << "\\u" << llvm::format("%04x", CodePoint);
+    else
+      OS << "\\U" << llvm::format("%08x", CodePoint);
+    OS.flush();
+    Specifier = CodePointStr;
+  }
+
+  EmitFormatDiagnostic(
+      S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc,
+      /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen));
+
   return keepGoing;
 }
 
Index: lib/Analysis/PrintfFormatString.cpp
===================================================================
--- lib/Analysis/PrintfFormatString.cpp
+++ lib/Analysis/PrintfFormatString.cpp
@@ -15,6 +15,8 @@
 #include "clang/Analysis/Analyses/FormatString.h"
 #include "FormatStringParsing.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Locale.h"
 
 using clang::analyze_format_string::ArgType;
 using clang::analyze_format_string::FormatStringHandler;
@@ -312,8 +314,19 @@
     argIndex++;
 
   if (k == ConversionSpecifier::InvalidSpecifier) {
+    const UTF8 *SB = reinterpret_cast<const UTF8 *>(Start + 1);
+    const UTF8 *SE = reinterpret_cast<const UTF8 *>(E);
+    const char FirstByte = *SB;
+    unsigned Len = I - Start;
+
+    // If the specifier in non-printable, it could be the first byte of a
+    // UTF-8 sequence. If that's the case, adjust the length accordingly.
+    if (Start + 1 < I && !llvm::sys::locale::isPrint(FirstByte) &&
+        isLegalUTF8String(&SB, SE))
+      Len = getNumBytesForUTF8(FirstByte) + 1;
+
     // Assume the conversion takes one argument.
-    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start);
+    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len);
   }
   return PrintfSpecifierResult(Start, FS);
 }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to