[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-05-29 Thread Artem Dergachev via Phabricator via cfe-commits
NoQ added a comment.

Uhm, messed up the phabricator link in https://reviews.llvm.org/rL304162, which 
should have been pointing to https://reviews.llvm.org/D30909 but points here 
instead.


Repository:
  rL LLVM

https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-03-08 Thread Phabricator via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL297326: [analyzer] Extend taint propagation and checking to 
support LazyCompoundVal (authored by zaks).

Changed prior to commit:
  https://reviews.llvm.org/D28445?vs=90868=91104#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D28445

Files:
  cfe/trunk/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
  cfe/trunk/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  cfe/trunk/lib/StaticAnalyzer/Core/RegionStore.cpp
  cfe/trunk/test/Analysis/taint-generic.c

Index: cfe/trunk/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
===
--- cfe/trunk/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ cfe/trunk/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
@@ -59,6 +59,30 @@
   /// \return The value bound to the location \c loc.
   virtual SVal getBinding(Store store, Loc loc, QualType T = QualType()) = 0;
 
+  /// Return the default value bound to a region in a given store. The default
+  /// binding is the value of sub-regions that were not initialized separately
+  /// from their base region. For example, if the structure is zero-initialized
+  /// upon construction, this method retrieves the concrete zero value, even if
+  /// some or all fields were later overwritten manually. Default binding may be
+  /// an unknown, undefined, concrete, or symbolic value.
+  /// \param[in] store The store in which to make the lookup.
+  /// \param[in] R The region to find the default binding for.
+  /// \return The default value bound to the region in the store, if a default
+  /// binding exists.
+  virtual Optional getDefaultBinding(Store store, const MemRegion *R) = 0;
+
+  /// Return the default value bound to a LazyCompoundVal. The default binding
+  /// is used to represent the value of any fields or elements within the
+  /// structure represented by the LazyCompoundVal which were not initialized
+  /// explicitly separately from the whole structure. Default binding may be an
+  /// unknown, undefined, concrete, or symbolic value.
+  /// \param[in] lcv The lazy compound value.
+  /// \return The default value bound to the LazyCompoundVal \c lcv, if a
+  /// default binding exists.
+  Optional getDefaultBinding(nonloc::LazyCompoundVal lcv) {
+return getDefaultBinding(lcv.getStore(), lcv.getRegion());
+  }
+
   /// Return a state with the specified value bound to the given location.
   /// \param[in] store The analysis state.
   /// \param[in] loc The symbolic memory location.
Index: cfe/trunk/test/Analysis/taint-generic.c
===
--- cfe/trunk/test/Analysis/taint-generic.c
+++ cfe/trunk/test/Analysis/taint-generic.c
@@ -169,6 +169,43 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
+}
+
+void testStruct() {
+  struct {
+char buf[16];
+int length;
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, , sizeof(tainted));
+  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+}
+
+void testStructArray() {
+  struct {
+char buf[16];
+struct {
+  int length;
+} st[1];
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, [0], sizeof(tainted.buf));
+  read(sock, [0], sizeof(tainted.st));
+  // FIXME: tainted.st[0].length should be marked tainted
+  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
 }
 
 int testDivByZero() {
Index: cfe/trunk/lib/StaticAnalyzer/Core/RegionStore.cpp
===
--- cfe/trunk/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ cfe/trunk/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -494,6 +494,11 @@
 return getBinding(getRegionBindings(S), L, T);
   }
 
+  Optional getDefaultBinding(Store S, const MemRegion *R) override {
+RegionBindingsRef B = getRegionBindings(S);
+return B.getDefaultBinding(R);
+  }
+
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
 
   SVal getBindingForElement(RegionBindingsConstRef B, const ElementRegion *R);
Index: cfe/trunk/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- cfe/trunk/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ cfe/trunk/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,6 +65,18 @@
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext );
 
+  /// This is 

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-03-07 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added a comment.

In https://reviews.llvm.org/D28445#694906, @zaks.anna wrote:

> @vlad.tsyrklevich,
>
> Do you have commit access or should we commit on your behalf?


You should commit on my behalf.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-03-07 Thread Anna Zaks via Phabricator via cfe-commits
zaks.anna added a comment.

@vlad.tsyrklevich,

Do you have commit access or should we commit on your behalf?


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-03-07 Thread Artem Dergachev via Phabricator via cfe-commits
NoQ accepted this revision.
NoQ added a comment.
This revision is now accepted and ready to land.

I believe this should land. Thank you very much for getting this far to get 
this fixed.

My take on the documentation:

  Return the default value bound to a region in a given store. The default 
binding is the value of sub-regions that were not initialized separately from 
their base region. For example, if the structure is zero-initialized upon 
construction, this method retrieves the concrete zero value, even if some or 
all fields were later overwritten manually. Default binding may be an unknown, 
undefined, concrete, or symbolic value.
  \param[in] store The store in which to make the lookup.
  \param[in] R The region to find the default binding for.



  Return the default value bound to a LazyCompoundVal. The default binding is 
used to represent the value of any fields or elements within the structure 
represented by the LazyCompoundVal which were not initialized explicitly 
separately from the whole structure. Default binding may be an unknown, 
undefined, concrete, or symbolic value.
  \param[in] lcv The lazy compound value.
  \return The default value bound to the LazyCompoundVal \c lcv, if a default 
binding exists.




Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:455
+  // Otherwise, return a nullptr as there's not yet a functional way to taint
+  // sub-regions of LCVs.
+  return nullptr;

I'm not sure if i mentioned this before, but for this case we could store taint 
information in the program state as a map **//T//** from symbols to sets of 
regions, so that a `SymbolDerived`-class symbol with parent symbol **//S//** 
and parent region **//R//** is auto-tainted when **//R//** is a sub-region of 
at least one region **//R'//** in **//T(S)//**.

That is, if we need to taint some fields in a structure with default symbol 
**//S//**, we add the relevant field regions to **//T(S)//**, and later lookup 
if the derived symbol's parent region is within one of the 
"tainted-regions-for-that-symbol".

That's a crazy plan, but i believe it's also quite expressive, using the SVal 
hierarchy to the fullest. So it might be the way to go.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-22 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich updated this revision to Diff 89467.
vlad.tsyrklevich added a comment.

@NoQ I've tried to address all the issues you mentioned in this change. Let me 
know if the expanded documentation doesn't address what you were looking for.


https://reviews.llvm.org/D28445

Files:
  include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
  lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  lib/StaticAnalyzer/Core/RegionStore.cpp
  test/Analysis/taint-generic.c

Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,43 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
+}
+
+void testStruct() {
+  struct {
+char buf[16];
+int length;
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, , sizeof(tainted));
+  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+}
+
+void testStructArray() {
+  struct {
+char buf[16];
+struct {
+  int length;
+} st[1];
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, [0], sizeof(tainted.buf));
+  read(sock, [0], sizeof(tainted.st));
+  // FIXME: tainted.st[0].length should be marked tainted
+  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/RegionStore.cpp
===
--- lib/StaticAnalyzer/Core/RegionStore.cpp
+++ lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -494,6 +494,11 @@
 return getBinding(getRegionBindings(S), L, T);
   }
 
+  Optional getDefaultBinding(Store S, const MemRegion *R) override {
+RegionBindingsRef B = getRegionBindings(S);
+return B.getDefaultBinding(R);
+  }
+
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
 
   SVal getBindingForElement(RegionBindingsConstRef B, const ElementRegion *R);
Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,6 +65,18 @@
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext );
 
+  /// This is called from getPointedToSymbol() to resolve symbol references for
+  /// the region underlying a LazyCompoundVal. This is the default binding
+  /// for the LCV, which could be a conjured symbol from a function call that
+  /// initialized the region. It only returns the conjured symbol if the LCV
+  /// covers the entire region, e.g. we avoid false positives by not returning
+  /// a default bindingc for an entire struct if the symbol for only a single
+  /// field or element within it is requested.
+  // TODO: Return an appropriate symbol for sub-fields/elements of an LCV so
+  // that they are also appropriately tainted.
+  static SymbolRef getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal );
+
   /// \brief Given a pointer argument, get the symbol of the value it contains
   /// (points to).
   static SymbolRef getPointedToSymbol(CheckerContext , const Expr *Arg);
@@ -423,6 +435,27 @@
   return false;
 }
 
+SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal ) {
+  StoreManager  = C.getStoreManager();
+
+  // getLCVSymbol() is reached in a PostStmt so we can always expect a default
+  // binding to exist if one is present.
+  if (Optional binding = StoreMgr.getDefaultBinding(LCV)) {
+SymbolRef Sym = binding->getAsSymbol();
+if (!Sym)
+  return nullptr;
+
+// If the LCV covers an entire base region return the default conjured symbol.
+if (LCV.getRegion() == LCV.getRegion()->getBaseRegion())
+  return Sym;
+  }
+
+  // Otherwise, return a nullptr as there's not yet a functional way to taint
+  // sub-regions of LCVs.
+  return nullptr;
+}
+
 SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext ,
   const Expr* Arg) {
   ProgramStateRef State = C.getState();
@@ -438,6 +471,10 @@
 dyn_cast(Arg->getType().getCanonicalType().getTypePtr());
   SVal Val = State->getSVal(*AddrLoc,
 ArgTy ? ArgTy->getPointeeType(): QualType());
+
+  if (auto LCV = Val.getAs())
+return getLCVSymbol(C, *LCV);
+
   return Val.getAsSymbol();
 }
 
Index: 

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-20 Thread Artem Dergachev via Phabricator via cfe-commits
NoQ added a comment.

Yeah, i think this is now as easy as i expected it to be :)

Still, the new API is in need of better documentation, because the notion of 
default binding is already confusing, and the new use-case we now have for this 
API is even more confusing. I don't instantly see a good way to justify this 
hack, but some day we'd need to either do that or refactor further. The notion 
of default binding needs to become more "material".

A more expanded doxygen comment should probably be a great start.




Comment at: include/clang/StaticAnalyzer/Core/PathSensitive/Store.h:66
+  /// \return The default value bound to the LazyCompoundVal \c lcv.
+  virtual SVal getDefaultBinding(Store store, nonloc::LazyCompoundVal lcv) = 0;
+

I think there are two different use cases here:
1. `getDefaultBinding(Store, Region)` would retrieve default binding in a given 
store for the region's base region,
2. `getDefaultBinding(LazyCompoundVal)` would retrieve default binding for the 
lazy compound value's base region from the lazy compound value's store - a 
shorthand for `getDefaultBinding(lcv.getStore(), lcv.getRegion())`.

Otherwise we're passing two different stores into the function (one directly, 
another as part of the lazy compound value), and it's confusing which one is 
actually used.





Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:498
+  SVal getDefaultBinding(Store S, nonloc::LazyCompoundVal L) override {
+if (!L.getRegion())
+  return UnknownVal();

`LazyCompoundVal` always has a region.



Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:505
+
+return UnknownVal();
+  }

Because UnknownVal is an interesting default binding, quite different from lack 
of default binding, i'd rather return `Optional` from that function (and 
None here).


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-15 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added inline comments.



Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:502
+RegionBindingsRef B = getRegionBindings(S);
+const MemRegion *MR  = L.getRegion()->getBaseRegion();
+if (Optional V = B.getDefaultBinding(MR))

vlad.tsyrklevich wrote:
> vlad.tsyrklevich wrote:
> > a.sidorin wrote:
> > > We get the LazyCompoundVal for some region but return the symbol for its 
> > > base. It means that at least method name is very confusing.
> > I believe that default bindings are only on base regions, so if you pass a 
> > reference to `outer_struct.inner_struct` the default binding for that LCV 
> > will be over `outer_struct`. I'm basing this on other references to LCVs in 
> > Core/RegionStore.cpp but I could be wrong. Either way, I'd be happy to 
> > change the interface to have the caller pass the correct MemRegion here.
> One change we could introduce to make getDefaultBinding() more explicit is to 
> have the caller pass LCV.getRegion()->getBaseRegion() instead of just passing 
> the LCV. However, this has the disadvantage of hardcoding an implementation 
> detail of RegionStoreManager into users of the StoreManager interface. I 
> think the correct way forward here might just be to add a comment that 
> mentions that currently RegionStoreManagers bindings are only over base 
> regions. What do you think?
I spoke too soon, after digging a little deeper I realized that lookups using 
`RegionBindingsRef ::getDefaultBinding` converts to a base region check anyway 
in `BindingKey::Make`. Therefore it'd be better to just completely hide that 
implementation detail to the RegionBindings than hard coding it here 
unnecessarily. The updated revision corrects this.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-15 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich updated this revision to Diff 88493.
vlad.tsyrklevich added a comment.

Remove an unnecessary call to `getBaseRegion()`, remove the logic to create a 
new SymbolDerived as it's currently unused, and add a comment to reflect that 
`getLCVSymbol()` is called in a PostStmt and a default binding should always be 
present.


https://reviews.llvm.org/D28445

Files:
  include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
  lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  lib/StaticAnalyzer/Core/RegionStore.cpp
  test/Analysis/taint-generic.c

Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,43 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
+}
+
+void testStruct() {
+  struct {
+char buf[16];
+int length;
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, , sizeof(tainted));
+  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+}
+
+void testStructArray() {
+  struct {
+char buf[16];
+struct {
+  int length;
+} st[1];
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, [0], sizeof(tainted.buf));
+  read(sock, [0], sizeof(tainted.st));
+  // FIXME: tainted.st[0].length should be marked tainted
+  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/RegionStore.cpp
===
--- lib/StaticAnalyzer/Core/RegionStore.cpp
+++ lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -494,6 +494,17 @@
 return getBinding(getRegionBindings(S), L, T);
   }
 
+  SVal getDefaultBinding(Store S, nonloc::LazyCompoundVal L) override {
+if (!L.getRegion())
+  return UnknownVal();
+
+RegionBindingsRef B = getRegionBindings(S);
+if (Optional V = B.getDefaultBinding(L.getRegion()))
+  return *V;
+
+return UnknownVal();
+  }
+
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
 
   SVal getBindingForElement(RegionBindingsConstRef B, const ElementRegion *R);
Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,6 +65,10 @@
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext );
 
+  /// \brief Get the symbol for the region underlying a LazyCompoundVal.
+  static SymbolRef getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal );
+
   /// \brief Given a pointer argument, get the symbol of the value it contains
   /// (points to).
   static SymbolRef getPointedToSymbol(CheckerContext , const Expr *Arg);
@@ -423,6 +427,25 @@
   return false;
 }
 
+SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal ) {
+  StoreManager  = C.getStoreManager();
+
+  // getLCVSymbol() is reached in a PostStmt so we can always expect a default
+  // binding to exist if one is present.
+  SymbolRef Sym = StoreMgr.getDefaultBinding(LCV.getStore(), LCV).getAsSymbol();
+  if (!Sym)
+return nullptr;
+
+  // If the LCV covers an entire base region return the default conjured symbol.
+  if (LCV.getRegion() == LCV.getRegion()->getBaseRegion())
+return Sym;
+
+  // Otherwise, return a nullptr as there's not yet a functional way to taint
+  // sub-regions of LCVs.
+  return nullptr;
+}
+
 SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext ,
   const Expr* Arg) {
   ProgramStateRef State = C.getState();
@@ -438,6 +461,10 @@
 dyn_cast(Arg->getType().getCanonicalType().getTypePtr());
   SVal Val = State->getSVal(*AddrLoc,
 ArgTy ? ArgTy->getPointeeType(): QualType());
+
+  if (auto LCV = Val.getAs())
+return getLCVSymbol(C, *LCV);
+
   return Val.getAsSymbol();
 }
 
Index: include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
===
--- include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
@@ -59,6 +59,12 @@
   /// \return The value bound to the location \c loc.
   virtual SVal getBinding(Store store, Loc loc, QualType T = QualType()) = 0;
 
+  /// Return the 

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-15 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added inline comments.



Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:442
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {

a.sidorin wrote:
> vlad.tsyrklevich wrote:
> > a.sidorin wrote:
> > > NoQ wrote:
> > > > We need to be careful in the case when we don't have the definition in 
> > > > the current translation unit. In this case we may still have derived 
> > > > symbols by casting the pointer into some blindly guessed type, which 
> > > > may be primitive or having well-defined primitive fields.
> > > > 
> > > > By the way, in D26837 i'm suspecting that there are other errors of 
> > > > this kind in the checker, eg. when a function returns a void pointer, 
> > > > we put taint on symbols of type "void", which is weird.
> > > > 
> > > > Adding Alexey who may recall something on this topic.
> > > I will check the correctness of this code sample because I have some 
> > > doubts about it.
> > > The problem of casts is hard because of our approach to put taints on 0th 
> > > elements. We lose casts and may get some strange void symbols. However, I 
> > > guess this patch isn't related to this problem directly.
> > Not sure which form of correctness you're interested in here but I'll bring 
> > up one issue I'm aware of: currently this will create a new SymbolDerived 
> > for an LCV sub-region, but it won't be correctly matched against in 
> > `isTainted()` because subsequent references to the same region will have a 
> > different SymbolDerived. This is the FIXME I mentioned below in 
> > `taint-generic.c` I have some idea on how to fix this but I think it will 
> > probably require more back and forth, hence why I didn't include it in this 
> > change. As it stands now, the sub-region tainting could be removed without 
> > changing the functionality of the current patch.
> Checking a default binding symbol here works because we're in PostStmt of the 
> call that creates this default binding. I think this machinery deserves a 
> comment, it was not evident for me initially.
> However, I don't like to create a SymbolDerived manually. The first idea is 
> to just use getSVal(MemRegion*) which will return a SymbolDerived if it is. 
> The second is to create... SymbolMetadata that will carry a taint (if the 
> value is not a symbol, for example). Not sure if second idea is sane enough, 
> I need some comments on it. Artem, Anna?
> Also, instead of referring to the base region, we may need to walk parents 
> one-by-one.
I'd rather just get rid of the tainted SymbolDerived and re-introduce it for 
discussion in the follow-up change as it's currently not functional anyway and 
there is some room for debate on the correct way to make tainting of 
sub-regions of LCVs work correctly. 

As far as walking parents one-by-one, my current understanding of the 
RegionStoreManager led me to believe that that would be unnecessary. Currently, 
all bindings are made as offsets from a base region, reference the 
implementation of `RegionBindingsRef::addBinding`. Is there another reason to 
walk the parents that I'm missing?



Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:502
+RegionBindingsRef B = getRegionBindings(S);
+const MemRegion *MR  = L.getRegion()->getBaseRegion();
+if (Optional V = B.getDefaultBinding(MR))

vlad.tsyrklevich wrote:
> a.sidorin wrote:
> > We get the LazyCompoundVal for some region but return the symbol for its 
> > base. It means that at least method name is very confusing.
> I believe that default bindings are only on base regions, so if you pass a 
> reference to `outer_struct.inner_struct` the default binding for that LCV 
> will be over `outer_struct`. I'm basing this on other references to LCVs in 
> Core/RegionStore.cpp but I could be wrong. Either way, I'd be happy to change 
> the interface to have the caller pass the correct MemRegion here.
One change we could introduce to make getDefaultBinding() more explicit is to 
have the caller pass LCV.getRegion()->getBaseRegion() instead of just passing 
the LCV. However, this has the disadvantage of hardcoding an implementation 
detail of RegionStoreManager into users of the StoreManager interface. I think 
the correct way forward here might just be to add a comment that mentions that 
currently RegionStoreManagers bindings are only over base regions. What do you 
think?


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-14 Thread Aleksei Sidorin via Phabricator via cfe-commits
a.sidorin added a comment.

Sorry for the delay, I have commented inline.
For me, this patch looks like a strict improvement. I think, after some clean 
up this can be accepted.




Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:442
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {

vlad.tsyrklevich wrote:
> a.sidorin wrote:
> > NoQ wrote:
> > > We need to be careful in the case when we don't have the definition in 
> > > the current translation unit. In this case we may still have derived 
> > > symbols by casting the pointer into some blindly guessed type, which may 
> > > be primitive or having well-defined primitive fields.
> > > 
> > > By the way, in D26837 i'm suspecting that there are other errors of this 
> > > kind in the checker, eg. when a function returns a void pointer, we put 
> > > taint on symbols of type "void", which is weird.
> > > 
> > > Adding Alexey who may recall something on this topic.
> > I will check the correctness of this code sample because I have some doubts 
> > about it.
> > The problem of casts is hard because of our approach to put taints on 0th 
> > elements. We lose casts and may get some strange void symbols. However, I 
> > guess this patch isn't related to this problem directly.
> Not sure which form of correctness you're interested in here but I'll bring 
> up one issue I'm aware of: currently this will create a new SymbolDerived for 
> an LCV sub-region, but it won't be correctly matched against in `isTainted()` 
> because subsequent references to the same region will have a different 
> SymbolDerived. This is the FIXME I mentioned below in `taint-generic.c` I 
> have some idea on how to fix this but I think it will probably require more 
> back and forth, hence why I didn't include it in this change. As it stands 
> now, the sub-region tainting could be removed without changing the 
> functionality of the current patch.
Checking a default binding symbol here works because we're in PostStmt of the 
call that creates this default binding. I think this machinery deserves a 
comment, it was not evident for me initially.
However, I don't like to create a SymbolDerived manually. The first idea is to 
just use getSVal(MemRegion*) which will return a SymbolDerived if it is. The 
second is to create... SymbolMetadata that will carry a taint (if the value is 
not a symbol, for example). Not sure if second idea is sane enough, I need some 
comments on it. Artem, Anna?
Also, instead of referring to the base region, we may need to walk parents 
one-by-one.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-08 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added inline comments.



Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:442
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {

a.sidorin wrote:
> NoQ wrote:
> > We need to be careful in the case when we don't have the definition in the 
> > current translation unit. In this case we may still have derived symbols by 
> > casting the pointer into some blindly guessed type, which may be primitive 
> > or having well-defined primitive fields.
> > 
> > By the way, in D26837 i'm suspecting that there are other errors of this 
> > kind in the checker, eg. when a function returns a void pointer, we put 
> > taint on symbols of type "void", which is weird.
> > 
> > Adding Alexey who may recall something on this topic.
> I will check the correctness of this code sample because I have some doubts 
> about it.
> The problem of casts is hard because of our approach to put taints on 0th 
> elements. We lose casts and may get some strange void symbols. However, I 
> guess this patch isn't related to this problem directly.
Not sure which form of correctness you're interested in here but I'll bring up 
one issue I'm aware of: currently this will create a new SymbolDerived for an 
LCV sub-region, but it won't be correctly matched against in `isTainted()` 
because subsequent references to the same region will have a different 
SymbolDerived. This is the FIXME I mentioned below in `taint-generic.c` I have 
some idea on how to fix this but I think it will probably require more back and 
forth, hence why I didn't include it in this change. As it stands now, the 
sub-region tainting could be removed without changing the functionality of the 
current patch.



Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:502
+RegionBindingsRef B = getRegionBindings(S);
+const MemRegion *MR  = L.getRegion()->getBaseRegion();
+if (Optional V = B.getDefaultBinding(MR))

a.sidorin wrote:
> We get the LazyCompoundVal for some region but return the symbol for its 
> base. It means that at least method name is very confusing.
I believe that default bindings are only on base regions, so if you pass a 
reference to `outer_struct.inner_struct` the default binding for that LCV will 
be over `outer_struct`. I'm basing this on other references to LCVs in 
Core/RegionStore.cpp but I could be wrong. Either way, I'd be happy to change 
the interface to have the caller pass the correct MemRegion here.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-07 Thread Aleksei Sidorin via Phabricator via cfe-commits
a.sidorin added a comment.

Hi Artem,
Thank you for adding me, I missed this patch. I have few comments below. If you 
(and Vlad) can wait for two or  three days, I will re-check the place I'm 
worrying about and post the results.




Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:442
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {

NoQ wrote:
> We need to be careful in the case when we don't have the definition in the 
> current translation unit. In this case we may still have derived symbols by 
> casting the pointer into some blindly guessed type, which may be primitive or 
> having well-defined primitive fields.
> 
> By the way, in D26837 i'm suspecting that there are other errors of this kind 
> in the checker, eg. when a function returns a void pointer, we put taint on 
> symbols of type "void", which is weird.
> 
> Adding Alexey who may recall something on this topic.
I will check the correctness of this code sample because I have some doubts 
about it.
The problem of casts is hard because of our approach to put taints on 0th 
elements. We lose casts and may get some strange void symbols. However, I guess 
this patch isn't related to this problem directly.



Comment at: lib/StaticAnalyzer/Core/RegionStore.cpp:502
+RegionBindingsRef B = getRegionBindings(S);
+const MemRegion *MR  = L.getRegion()->getBaseRegion();
+if (Optional V = B.getDefaultBinding(MR))

We get the LazyCompoundVal for some region but return the symbol for its base. 
It means that at least method name is very confusing.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-06 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich updated this revision to Diff 87237.
vlad.tsyrklevich marked 4 inline comments as done.
vlad.tsyrklevich added a comment.

As Artem mentioned, I reworked `getLCVSymbol()` to get the default bindings 
directly from the StoreManager which required adding a new method. I also 
reverted the tainting logic to add taint more conservatively. I have another 
change that will fix the new FIXME test that is added, but it might involved 
some refactoring and discussion so I've left it out to make this change as 
simple as possible.


https://reviews.llvm.org/D28445

Files:
  include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
  lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  lib/StaticAnalyzer/Core/RegionStore.cpp
  test/Analysis/taint-generic.c

Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,43 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
+}
+
+void testStruct() {
+  struct {
+char buf[16];
+int length;
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, , sizeof(tainted));
+  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+}
+
+void testStructArray() {
+  struct {
+char buf[16];
+struct {
+  int length;
+} st[1];
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, [0], sizeof(tainted.buf));
+  read(sock, [0], sizeof(tainted.st));
+  // FIXME: tainted.st[0].length should be marked tainted
+  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/RegionStore.cpp
===
--- lib/StaticAnalyzer/Core/RegionStore.cpp
+++ lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -494,6 +494,18 @@
 return getBinding(getRegionBindings(S), L, T);
   }
 
+  SVal getDefaultBinding(Store S, nonloc::LazyCompoundVal L) override {
+if (!L.getRegion())
+  return UnknownVal();
+
+RegionBindingsRef B = getRegionBindings(S);
+const MemRegion *MR  = L.getRegion()->getBaseRegion();
+if (Optional V = B.getDefaultBinding(MR))
+  return *V;
+
+return UnknownVal();
+  }
+
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
 
   SVal getBindingForElement(RegionBindingsConstRef B, const ElementRegion *R);
Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,6 +65,10 @@
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext );
 
+  /// \brief Get the symbol for the region underlying a LazyCompoundVal.
+  static SymbolRef getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal );
+
   /// \brief Given a pointer argument, get the symbol of the value it contains
   /// (points to).
   static SymbolRef getPointedToSymbol(CheckerContext , const Expr *Arg);
@@ -423,6 +427,22 @@
   return false;
 }
 
+SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal ) {
+  StoreManager  = C.getStoreManager();
+  SymbolRef Sym = StoreMgr.getDefaultBinding(LCV.getStore(), LCV).getAsSymbol();
+  if (!Sym)
+return nullptr;
+
+  // If the LCV covers an entire base region, return the default conjured symbol
+  if (LCV.getRegion() == LCV.getRegion()->getBaseRegion())
+return Sym;
+
+  // Otherwise, return a derived symbol indicating only a sub-region is tainted
+  SymbolManager  = C.getSymbolManager();
+  return SM.getDerivedSymbol(Sym, LCV.getRegion());
+}
+
 SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext ,
   const Expr* Arg) {
   ProgramStateRef State = C.getState();
@@ -438,6 +458,10 @@
 dyn_cast(Arg->getType().getCanonicalType().getTypePtr());
   SVal Val = State->getSVal(*AddrLoc,
 ArgTy ? ArgTy->getPointeeType(): QualType());
+
+  if (auto LCV = Val.getAs())
+return getLCVSymbol(C, *LCV);
+
   return Val.getAsSymbol();
 }
 
Index: include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
===
--- include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ 

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-02-01 Thread Artem Dergachev via Phabricator via cfe-commits
NoQ added a subscriber: a.sidorin.
NoQ added a comment.

Hello! Sorry, i'm very distracted recently.

I think your new approach should work, however it would be much easier and more 
straightforward to just ask the store manager to provide the default-bound 
symbol for a region directly, instead of trying to derive from it manually and 
then underive it back. In inline comments i also show that sometimes we're not 
really that much interested in the particular derived symbols.




Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:442
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {

We need to be careful in the case when we don't have the definition in the 
current translation unit. In this case we may still have derived symbols by 
casting the pointer into some blindly guessed type, which may be primitive or 
having well-defined primitive fields.

By the way, in D26837 i'm suspecting that there are other errors of this kind 
in the checker, eg. when a function returns a void pointer, we put taint on 
symbols of type "void", which is weird.

Adding Alexey who may recall something on this topic.



Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:455
+  } else if (T->isArrayType()) {
+const ArrayType *AT = T->getAsArrayTypeUnsafe();
+const ElementRegion *ER =

The "safe" way to do this is to use one of the `ASTContext`'s 
`get***ArrayType()` methods.



Comment at: lib/StaticAnalyzer/Core/ProgramState.cpp:667
 
+  // If this a derived symbol, taint the parent symbol.
+  if (const SymbolDerived *SD = dyn_cast(Sym))

While this is quite vague, i can imagine us wanting to taint only a particular 
field of a symbolic structure. Considering that taint also automatically 
propagates in the opposite direction (from parent symbol to derived symbol), 
i'd rather preserve some freedom by moving this code to the taint-checker (and 
other places where we may want to generate taint; for now, it's 
`getLCVSymbol()`), forcing them to manually lookup the parent symbol and put 
taint there when they want to.



Comment at: test/Analysis/taint-tester.c:53
   scanf("%d", );
   int tx = xy.x; // expected-warning + {{tainted}}
+  int ty = xy.y; // expected-warning + {{tainted}}

I don't think you'd be able to pass this test by tweaking the taint mechanisms 
alone. The original FIXME here appears because the second `scanf()` destroys 
the first `scanf()`'s binding, together with the default-bound symbol. There's 
no way you preserve taint on `y` without adding taint on `z`, unless you model 
`scanf()` to update only specific store bindings.

In any case, this makes a good example for my comment in `addTaint()`. Because 
false positives are worse than false negatives, we shouldn't add taint to the 
default-bound symbol here.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-26 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added a comment.

Hello @NoQ, I just wanted to ping you on the updated change since I'm not sure 
if you saw it.


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-15 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich updated this revision to Diff 84517.
vlad.tsyrklevich added a comment.

Artem, thank you for the very detailed reply! Between this, and hitting the 
right search terms to find your clang analyzer guide my understanding of the 
symbol abstractions the analyzer exposes has improved significantly.

You state that it's not easy to derive the conjured symbols from the Store; 
however, it didn't seem to be too difficult to do by recursively finding the 
bindings for the constituent FieldRegions (if the LCV is backing a 
struct/union) or the first ElementRegion (if the LCV is backing an array) until 
you reach an element/field initialized with the conjured symbol. Does the new 
patch look correct to you? Your comment about the difficulty has me unsure 
whether I've fully grasped the scope of the problem.

One wrinkle you'll notice is in the patch to `taint-tester.c`, one FIXME for 
missing taint has been fixed, but now one variable that should not be tainted 
is! This seems to be because of overeager invalidation, not strictly the fault 
of the patch but it is exposed by it.


https://reviews.llvm.org/D28445

Files:
  lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  lib/StaticAnalyzer/Core/ProgramState.cpp
  test/Analysis/taint-generic.c
  test/Analysis/taint-tester.c

Index: test/Analysis/taint-tester.c
===
--- test/Analysis/taint-tester.c
+++ test/Analysis/taint-tester.c
@@ -51,8 +51,9 @@
   scanf("%d", );
   scanf("%d", );
   int tx = xy.x; // expected-warning + {{tainted}}
-  int ty = xy.y; // FIXME: This should be tainted as well.
-  char ntz = xy.z;// no warning
+  int ty = xy.y; // expected-warning + {{tainted}}
+  // FIXME: xy.z should not be tainted
+  char ntz = xy.z; // expected-warning + {{tainted}}
   // Now, scanf scans both.
   scanf("%d %d", , );
   int ttx = xy.x; // expected-warning + {{tainted}}
Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,26 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
+}
+
+void testStruct() {
+  struct {
+struct {} st;
+char buf[16];
+int length;
+  } tainted;
+
+  char buffer[16];
+  int sock;
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, , sizeof(tainted));
+  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/ProgramState.cpp
===
--- lib/StaticAnalyzer/Core/ProgramState.cpp
+++ lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -664,6 +664,10 @@
   while (const SymbolCast *SC = dyn_cast(Sym))
 Sym = SC->getOperand();
 
+  // If this a derived symbol, taint the parent symbol.
+  if (const SymbolDerived *SD = dyn_cast(Sym))
+Sym = SD->getParentSymbol();
+
   ProgramStateRef NewState = set(Sym, Kind);
   assert(NewState);
   return NewState;
Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,6 +65,11 @@
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext );
 
+  /// \brief Given a LazyCompoundValue, get the symbol of the first FieldRegion/
+  /// ElementRegion we can find a binding for.
+  static SymbolRef getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal );
+
   /// \brief Given a pointer argument, get the symbol of the value it contains
   /// (points to).
   static SymbolRef getPointedToSymbol(CheckerContext , const Expr *Arg);
@@ -423,6 +428,43 @@
   return false;
 }
 
+SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext ,
+nonloc::LazyCompoundVal ) {
+  StoreManager  = C.getStoreManager();
+  MemRegionManager  = C.getSValBuilder().getRegionManager();
+
+  QualType T = LCV.getRegion()->getValueType();
+  if (T->isStructureType() || T->isUnionType()) {
+const RecordType *RT = T->getAsStructureType();
+if (!RT)
+  RT = T->getAsUnionType();
+
+const RecordDecl *RD = RT->getDecl()->getDefinition();
+for (const auto *I : RD->fields()) {
+  const FieldRegion *FR = MrMgr.getFieldRegion(I, LCV.getRegion());
+  const SVal  =
+  StoreMgr.getBinding(LCV.getStore(), loc::MemRegionVal(FR));
+  if (auto Sym = V.getAsSymbol())
+return Sym;
+  else if (auto _LCV = 

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-10 Thread Artem Dergachev via Phabricator via cfe-commits
NoQ added a comment.

Thanks for working on the taint! I really wish the taint analysis in the 
analyzer to flourish, and the part you've digged into is particularly 
sensitive, so i'd dump some thoughts here, hopefully helpful.

**What works as it should: **

> In the second example, the SVal obtained in 
> `GenericTaintChecker::getPointedToSymbol()` is a `LazyCompoundVal` so 
> `SVal::getAsSymbol()` returns a NULL SymbolRef, meaning the symbol is not 
> tainted.

Only symbols currently can, and in my opinion ever should, directly carry taint.

We have symbols, regions, concrete values, and compound values.

A symbol //$s// denotes a particular but unknown value from a certain set 
//**S**// (say, of all integers or all pointers); we are not generally sure if 
it can denote any value from //**S**//, or only from a smaller subset 
//**S'**// within //**S**// (for example, the analyzer may be unaware that 
//$s// is always positive, because the small part of the code it's looking into 
doesn't give any hints). Analyzer also gathers constraints //**C**// when he 
picks execution paths - for instance, on a branch condition "//if ($s < 5)//", 
if we pick the true branch, //**C**// gets cropped into //[-INT_MIN, 4]//, and 
the possible values of //$s// stay within //**S'**// intersected with 
//**C**//, while the analyzer thinks that possible values of //$s// are within 
//**S**// intersected with //**C**//.

A tainted symbol merely corresponds to the special case when we know that 
//**S'** = **S**//, which we normally don't - that is, the analyzer knows that 
for some reason the symbol's value is so much chaotic that it may definitely 
take any value from //**S**// in run-time. The analyzer can know it by seeing 
that the symbol comes from an "untrusted source" such as from "outside".

We do not really talk about tainted regions - instead, we talk about tainted 
pointer values, which are symbols. In this sense, in code

  char buffer[100];

the region of variable `buffer` cannot be tainted. No matter what we do, the 
buffer region itself comes from a perfectly trusted source - it's always in the 
same well-known segment of memory (stack or in global memory). In practice this 
means that the attacker cannot affect or forge the positioning of this segment 
in any way. On the other hand, if //$i// is a tainted index symbol, then region 
//buffer[$i]// of the //$i//'th element would be "said to be tainted" - in the 
sense, its pointer is known to possibly take any value, not necessarily within 
range of the buffer. Similarly, if a pointer-symbol is tainted, the segment of 
memory (region) it points to (called `SymbolicRegion`) is "said to be tainted". 
So //whenever we're talking about a "tainted region", it's always a shorthand 
for talking about a certain tainted symbol//.

Concrete values shouldn't be tainted for the same reason - they are already 
"known", their //**S**// would consist of one element, so there's little 
interest in saying that they can take any value from this set - they're bound 
to take that only value.

Now, we have (possibly lazy) compound values. The whole point of compound 
values is that they consist of multiple other values, and therefore there's 
usually little sense in even considering them as a whole. Some sub-values of 
them may be null, over-constrained, uninitialized, irrelevant (eg. padding), or 
tainted, while other sub-values may be completely different. In your example, 
you obtain a compound value of an array, which consists of all values of all 
elements of the array. There's no point in asking if the compound value itself 
is tainted; instead, you might be interested in particular elements. For 
example, `buffer[1]` may be tainted, while `buffer[2]` was overwritten with 
`'\0'` and is no longer tainted. If you had a structure, you may ask if a 
certain field is tainted, or if all fields are tainted.

**What works but not as it should:**

> In the first example, buf has it's symbol correctly extracted (via 
> `GenericTaintChecker::getPointedToSymbol())` as a 
> `SymbolDerived{conj_$N{char}, element{buf,0 S64b,char}}`.

Nope, unfortunately, in fact this is not correctly extracted either. This 
symbol denotes the value of the first element of the buffer. The taint checker 
marks the first element as tainted, and then later reads that the first element 
is tainted, and throws the warning. However, if we pass, say, `buffer+1` to 
`execl`, it breaks.

The root cause of the problem is how the analyzer denotes casted/typed 
pointers: the `SVal` for a pointer of type `T*` is said to be the element 
region with index 0 of the region behind the pointer. You can read this as "The 
pointer points to the first `T` on this segment of memory". In this case, we 
try to say that the value behind the pointer is tainted, and we forget that the 
value behind the pointer is not only the first element of the array, but the 
whole array, even though the pointer points to the first element only.

[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-10 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich added inline comments.



Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:443
+  if (auto LCV = Val.getAs())
+return C.getSymbolManager().getRegionValueSymbol(LCV->getRegion());
+

zaks.anna wrote:
> This might create a new symbol. Is this what we want?
I'm not sure how to turn an LCV into a proper symbol, so without creating new 
symbols the best approach I can see is changing `getPointedToSymbol()` to 
`getPointedToSval()` and also update `addTaint()` and `isTainted()` to accept 
SVals. Then you could have separate TaintMaps that include both symbols and 
regions and check both for taintedness. Does that sound like the correct 
approach to you?


https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-09 Thread Anna Zaks via Phabricator via cfe-commits
zaks.anna added a comment.

Thanks for working on this!




Comment at: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp:443
+  if (auto LCV = Val.getAs())
+return C.getSymbolManager().getRegionValueSymbol(LCV->getRegion());
+

This might create a new symbol. Is this what we want?



Comment at: lib/StaticAnalyzer/Core/ProgramState.cpp:694
+  if (const TypedValueRegion *TVR = dyn_cast(Reg)) {
+SymbolRef Sym = getSymbolManager().getRegionValueSymbol(TVR);
+

This might create a new symbol as well. Is this what we want here?



https://reviews.llvm.org/D28445



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D28445: [Analyzer] Extend taint propagation and checking

2017-01-07 Thread Vlad Tsyrklevich via Phabricator via cfe-commits
vlad.tsyrklevich created this revision.
vlad.tsyrklevich added reviewers: zaks.anna, cfe-commits.

While extending the GenericTaintChecker for my purposes I'm hitting a case 
where taint is not propagated where I believe it should be. Currently, the 
following example will propagate taint correctly:

  char buf[16];
  taint_source(buf);
  taint_sink(buf);

However, the following fails:

  char buf[16];
  taint_source();
  taint_sink(buf);

In the first example, buf has it's symbol correctly extracted (via 
`GenericTaintChecker::getPointedToSymbol()`) as a  
`SymbolDerived{conj_$N{char}, element{buf,0 S64b,char}}`, it's marked as 
tainted and then the taint check correctly finds it using 
`ProgramState::isTainted()`.

In the second example, the SVal obtained in 
`GenericTaintChecker::getPointedToSymbol()` is a LazyCompoundVal so 
`SVal::getAsSymbol()` returns a NULL SymbolRef, meaning the symbol is not 
tainted.

This change extends GenericTaintChecker to obtain a SymbolRegionValue from the 
LCV MemRegion in `getPointedToSymbol()`, and then extends 
`ProgramState::isTainted()` to correctly match a `SymbolRegionValue{buf}` 
against a `SymbolDerived{conj_$N{char}, element{buf,0 S64b,char}}`.

I'm not familiar enough with analyzer internals to be confident in whether this 
is the right approach to fix this bug, so feedback is welcome.


https://reviews.llvm.org/D28445

Files:
  lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
  lib/StaticAnalyzer/Core/ProgramState.cpp
  test/Analysis/taint-generic.c


Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,11 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the 
argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed 
to a system call}}
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/ProgramState.cpp
===
--- lib/StaticAnalyzer/Core/ProgramState.cpp
+++ lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -690,6 +690,15 @@
   if (!Reg)
 return false;
 
+  if (const TypedValueRegion *TVR = dyn_cast(Reg)) {
+SymbolRef Sym = getSymbolManager().getRegionValueSymbol(TVR);
+
+// We don't call isTainted(Sym, K) here because it could lead to infinite 
recursion.
+const TaintTagType *Tag = get(Sym);
+if (Tag && *Tag == K)
+  return true;
+  }
+
   // Element region (array element) is tainted if either the base or the offset
   // are tainted.
   if (const ElementRegion *ER = dyn_cast(Reg))
@@ -720,7 +729,8 @@
 
 // If this is a SymbolDerived with a tainted parent, it's also tainted.
 if (const SymbolDerived *SD = dyn_cast(*SI))
-  Tainted = Tainted || isTainted(SD->getParentSymbol(), Kind);
+  Tainted = Tainted || isTainted(SD->getParentSymbol(), Kind)
+|| isTainted(SD->getOriginRegion(), Kind);
 
 // If memory region is tainted, data is also tainted.
 if (const SymbolRegionValue *SRV = dyn_cast(*SI))
Index: lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
===
--- lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -438,6 +438,10 @@
 dyn_cast(Arg->getType().getCanonicalType().getTypePtr());
   SVal Val = State->getSVal(*AddrLoc,
 ArgTy ? ArgTy->getPointeeType(): QualType());
+
+  if (auto LCV = Val.getAs())
+return C.getSymbolManager().getRegionValueSymbol(LCV->getRegion());
+
   return Val.getAsSymbol();
 }
 


Index: test/Analysis/taint-generic.c
===
--- test/Analysis/taint-generic.c
+++ test/Analysis/taint-generic.c
@@ -169,6 +169,11 @@
   sock = socket(AF_LOCAL, SOCK_STREAM, 0);
   read(sock, buffer, 100);
   execl(buffer, "filename", 0); // no-warning
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  // References to both buffer and  as an argument should taint the argument
+  read(sock, , 100);
+  execl(buffer, "filename", 0); // expected-warning {{Untrusted data is passed to a system call}}
 }
 
 int testDivByZero() {
Index: lib/StaticAnalyzer/Core/ProgramState.cpp
===
--- lib/StaticAnalyzer/Core/ProgramState.cpp
+++ lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -690,6 +690,15 @@
   if (!Reg)
 return false;
 
+  if (const TypedValueRegion *TVR = dyn_cast(Reg)) {
+SymbolRef Sym = getSymbolManager().getRegionValueSymbol(TVR);
+
+// We don't call isTainted(Sym, K) here because it could lead to infinite recursion.
+const TaintTagType *Tag = get(Sym);
+if (Tag &&