teemperor retitled this revision from "Added ASTStructure for analyzing the 
structure of Stmts." to "Added basic capabilities to detect source code 
clones.".
teemperor updated the summary for this revision.
teemperor updated this revision to Diff 62877.
teemperor added a comment.

- Patch now only provides basic generic hashing and is smaller.
- Code style is now respecting LLVM/clang guidelines.
- Documented all non-trivial functions.
- Moved testing from unittests to normal tests.
- Added tests for false-positives.
- Fixed the problems pointed out by Vassil (beside "Is that the LLVM/Clang 
common notion for documenting private members: (i.e. doxygen disabled) instead 
of /").
- No longer patching SourceManager.h.


http://reviews.llvm.org/D20795

Files:
  include/clang/AST/CloneDetection.h
  include/clang/StaticAnalyzer/Checkers/Checkers.td
  lib/AST/CMakeLists.txt
  lib/AST/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CMakeLists.txt
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp
  test/Analysis/copypaste/test-min-max.cpp
  test/Analysis/copypaste/test-sub-sequences.cpp

Index: test/Analysis/copypaste/test-sub-sequences.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-sub-sequences.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s
+
+// We test if sub-sequences can match with normal sequences containing only
+// a single statement.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+  log2(a);
+  log(); // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) {
+  log(); // expected-note{{Related code clone is here.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) {
+  return a + b;
+}
Index: test/Analysis/copypaste/test-min-max.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-min-max.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+// False positives below. These clones should not be reported.
+
+// FIXME: Detect different binary operator kinds.
+int min1(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a < b)
+    return a;
+  return b;
+}
+
+// FIXME: Detect different variable patterns.
+int min2(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (b > a)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) {
+  return a + b;
+}
Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- /dev/null
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,67 @@
+//===--- CloneDetection.cpp - Clone detection checkers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/AST/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker : public Checker<check::EndOfTranslationUnit> {
+
+public:
+  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                             AnalysisManager &Mgr,
+                                             BugReporter &BR) const {
+  CloneDetector CloneDetector;
+  CloneDetector.AnalyzeTranslationUnitDecl(
+      TU->getASTContext().getTranslationUnitDecl());
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups =
+      CloneDetector.findClones();
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+                                               "Detected code clone.");
+
+  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+                                               "Related code clone is here.");
+
+  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    DiagEngine.Report(Group.front().getStartLoc(), WarnID);
+    for (unsigned J = 1; J < Group.size(); ++J) {
+      DiagEngine.Report(Group[J].getStartLoc(), NoteID);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CloneChecker>();
+}
Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt
===================================================================
--- lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -22,6 +22,7 @@
   CheckerDocumentation.cpp
   ChrootChecker.cpp
   ClangCheckers.cpp
+  CloneChecker.cpp
   DeadStoresChecker.cpp
   DebugCheckers.cpp
   DereferenceChecker.cpp
Index: lib/AST/CloneDetection.cpp
===================================================================
--- /dev/null
+++ lib/AST/CloneDetection.cpp
@@ -0,0 +1,310 @@
+//===--- CloneDetection.cpp - Analyses the structure of an AST --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+///  This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(CompoundStmt *Stmt, ASTContext *Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && Context && "Both Stmt and Context need to be valid pointers");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(Stmt *Stmt, ASTContext *Context)
+    : S(Stmt), Context(Context), StartIndex(0), EndIndex(0) {
+  // Test against passing StmtSequence(nullptr, valid_ptr) or
+  // StmtSequence(valid_ptr, nullptr).
+  assert((!Stmt && !Context) || (Stmt && Context));
+}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // If both sequences reside in different translation units, they can
+  // never contain each other.
+  if (Context != Other.Context)
+    return false;
+
+  // Otherwise check if the start and end locations of the current sequence
+  // surround the other sequence.
+  bool StartIsInBounds = Context->getSourceManager().isBeforeInTranslationUnit(
+                             getStartLoc(), Other.getStartLoc()) ||
+                         getStartLoc() == Other.getStartLoc();
+  bool EndIsInBounds = Context->getSourceManager().isBeforeInTranslationUnit(
+                           Other.getEndLoc(), getEndLoc()) ||
+                       Other.getEndLoc() == getEndLoc();
+  return StartIsInBounds && EndIsInBounds;
+}
+
+Stmt *const *StmtSequence::begin() const {
+  if (holdsSequence()) {
+    auto CS = static_cast<CompoundStmt *>(S);
+    return CS->body_begin() + StartIndex;
+  }
+  return &S;
+}
+
+Stmt *const *StmtSequence::end() const {
+  if (holdsSequence()) {
+    auto CS = static_cast<CompoundStmt *>(S);
+    return CS->body_begin() + EndIndex;
+  }
+  return &S;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// Utility class for computing a hash value.
+class HashValue {
+  unsigned Value = 0;
+
+public:
+  /// Computes the given data into the hash value.
+  void addData(unsigned Data) {
+    Value *= 53;
+    Value += Data;
+  }
+
+  /// Returns the hash value.
+  unsigned getValue() { return Value; }
+};
+} // end anonymous namespace
+
+namespace {
+/// Traverses the AST and calculates hash values and complexity values
+/// for all statements. The results are stored in a CloneDetector
+/// object.
+///
+/// This calculation happens in linear time and each statement is only
+/// visited a fixed amount of times during this process.
+/// The hash value for a statement is first calculated from the data stored
+/// directly in the statement object.
+/// Afterwards, the hash values of the children are calculated into the
+/// computed hash value.
+class HashVisitor : public RecursiveASTVisitor<HashVisitor> {
+
+  static const unsigned NullChildHashValue = 313;
+
+public:
+  explicit HashVisitor(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  // We need to traverse postorder over the AST for our algorithm
+  // to work as each parent expects that its children were already hashed.
+  bool shouldTraversePostOrder() const { return true; }
+
+  bool VisitStmt(Stmt *S);
+
+private:
+  void HandleChildHashes(HashValue &Hash, unsigned &Complexity,
+                         Stmt::child_iterator Iterator, unsigned StartPos,
+                         unsigned Length) {
+    auto StartIterator = Iterator;
+    for (unsigned I = 0; I < StartPos; ++I)
+      ++StartIterator;
+    auto EndIterator = Iterator;
+    for (unsigned I = 0; I < Length; ++I)
+      ++EndIterator;
+    HandleChildHashes(Hash, Complexity, StartIterator, EndIterator);
+  }
+
+  /// Iterates over the specified Stmt array and computes the hash
+  /// values of all statements into the given hash value.
+  /// Also, the complexity of all statements is added to the given
+  /// complexity value.
+  void HandleChildHashes(HashValue &Hash, unsigned &Complexity,
+                         Stmt::child_iterator StartIterator,
+                         Stmt::child_iterator EndIterator) {
+    // Iterate over each child in the sub-sequence.
+    for (auto I = StartIterator; I != EndIterator; ++I) {
+      Stmt *Child = *I;
+      if (Child == nullptr) {
+        ++Complexity;
+        // We use an placeholder hash value for null children.
+        Hash.addData(NullChildHashValue);
+      } else {
+        StmtSequence ChildSequence(StmtSequence(Child, &Context));
+
+        bool Error;
+        CloneDetector::StmtData Data = CD.findHash(ChildSequence, Error);
+        assert(!Error && "Child statement wasn't hashed before parent.");
+
+        Complexity += Data.Complexity;
+        Hash.addData(Data.Hash);
+      }
+    }
+  }
+
+  // Saves the current hash value into the persistent storage of this
+  // CloneDetector.
+  void SaveData(StmtSequence CurrentStmt, HashValue Hash,
+                       unsigned Complexity);
+
+  CloneDetector &CD;
+  ASTContext &Context;
+};
+} // end anonymous namespace
+
+bool HashVisitor::VisitStmt(Stmt *S) {
+  StmtSequence CurrentStmt(S, &Context);
+  HashValue Hash;
+  unsigned Complexity = 1;
+
+  // The only relevant data for now is the class of the statement,
+  // so we calculate the hash class into the current hash value.
+  // TODO: Handle statement class specific data.
+  Hash.addData(S->getStmtClass());
+
+  // Incorporate the hash values of all child Stmts into the current
+  // hash value.
+  HandleChildHashes(Hash, Complexity, S->child_begin(), S->child_end());
+
+  SaveData(CurrentStmt, Hash, Complexity);
+  return true;
+}
+
+void HashVisitor::SaveData(StmtSequence CurrentStmt, HashValue Hash,
+                                  unsigned Complexity) {
+  // Save current dat to the CloneDetector.
+  CloneDetector::StmtData Data(Hash.getValue(), Complexity);
+  CD.add(CurrentStmt, Data);
+
+  // If the currently hashed statement is a CompoundStmt, we also
+  // need to calculate the hash values for all possible sub-sequences
+  // in the body of the CompoundStmt.
+  auto CS = dyn_cast<CompoundStmt>(CurrentStmt.front());
+  if (!CS)
+    return;
+
+  // The length of the sub-sequence. We don't need to hash sequences
+  // with the length 1 as they are already handled when we hashed the
+  // children.
+  // We also don't need to hash the sub-sequence that contains the
+  // whole body as this would be equal to the hash of the CompoundStmt itself.
+  for (unsigned Length = 2; Length < CS->size(); ++Length) {
+    // The start index in the body of the CompoundStmt.
+    // We increase the position and rehash until the end of the sub-sequence
+    // reaches the end of the CompoundStmt body.
+    for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+      HashValue SubHash;
+      unsigned SubComplexity = 1;
+      StmtSequence Sequence(CS, &Context, Pos, Pos + Length);
+
+      // We consider a sub-sequence to be CompoundStmt that only contains
+      // the children in the sub-sequence. This is necessary for matching
+      // sub-sequences with full CompoundStmts.
+      SubHash.addData(CS->getStmtClass());
+
+      HandleChildHashes(SubHash, SubComplexity, CS->child_begin(), Pos, Length);
+
+      CloneDetector::StmtData SubData(SubHash.getValue(), SubComplexity);
+      CD.add(Sequence, SubData);
+    }
+  }
+}
+
+void CloneDetector::AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU) {
+  HashVisitor visitor(*this, TU->getASTContext());
+  visitor.TraverseDecl(TU);
+}
+
+void CloneDetector::add(const StmtSequence &S, const StmtData &Data) {
+  assert(!S.empty());
+  HashedStmts[S] = Data;
+}
+
+namespace {
+/// Returns true if and only if \p Stmt contains at least one
+/// other sequence in the \p Group.
+bool StmtContainsAnyInGroup(StmtSequence &Stmt,
+                            CloneDetector::CloneGroup &Group) {
+  for (StmtSequence &GroupStmt : Group) {
+    if (Stmt.contains(GroupStmt))
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if and only if all sequences in \p OtherGroup are contained
+/// by a sequence in \p Group.
+bool GroupContains(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // We have less sequences in the current group than we have in the other,
+  // so we will never fulfill the requirement for returning true.
+  // This is only possible because we know that a sequence in Group can
+  // contain at most one sequence in OtherGroup.
+  if (Group.size() < OtherGroup.size())
+    return false;
+
+  for (StmtSequence &Stmt : Group) {
+    if (!StmtContainsAnyInGroup(Stmt, OtherGroup))
+      return false;
+  }
+  return true;
+}
+} // end anonymous namespace
+
+std::vector<CloneDetector::CloneGroup>
+CloneDetector::findClones(unsigned MinGroupComplexity) {
+  std::vector<CloneGroup> Result;
+
+  std::map<unsigned, CloneGroup> GroupsByHash;
+
+  // Create a CloneGroup for every known statement.
+  for (auto &Pair : HashedStmts) {
+    // but ignore statements that don't fulfill the complexity requirement.
+    if (Pair.second.Complexity > MinGroupComplexity) {
+      GroupsByHash[Pair.second.Hash].push_back(Pair.first);
+    }
+  }
+
+  // Put all groups that now contain at least one sequence into the
+  // result. We still need to filter the Result.
+  for (auto &HashGroupPair : GroupsByHash) {
+    if (HashGroupPair.second.size() > 1) {
+      Result.push_back(HashGroupPair.second);
+    }
+  }
+
+  std::set<unsigned> IndexesToRemove;
+
+  // Compare every group in the result with the others. If one groups
+  // contains another group, we only need to return the bigger group.
+  for (unsigned I = 0; I < Result.size(); ++I) {
+    for (unsigned J = 0; J < Result.size(); ++J) {
+      if (I != J) {
+        if (GroupContains(Result[J], Result[I])) {
+          IndexesToRemove.insert(I);
+          break;
+        }
+      }
+    }
+  }
+
+  for (auto Iter = IndexesToRemove.rbegin(); Iter != IndexesToRemove.rend();
+       ++Iter) {
+    Result.erase(Result.begin() + (*Iter));
+  }
+
+  return Result;
+}
Index: lib/AST/CMakeLists.txt
===================================================================
--- lib/AST/CMakeLists.txt
+++ lib/AST/CMakeLists.txt
@@ -9,6 +9,7 @@
   ASTImporter.cpp
   ASTTypeTraits.cpp
   AttrImpl.cpp
+  CloneDetection.cpp
   CXXInheritance.cpp
   Comment.cpp
   CommentBriefParser.cpp
Index: include/clang/StaticAnalyzer/Checkers/Checkers.td
===================================================================
--- include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -77,6 +77,8 @@
 def LLVM : Package<"llvm">;
 def Debug : Package<"debug">;
 
+def CloneDetection : Package<"clone">;
+
 //===----------------------------------------------------------------------===//
 // Core Checkers.
 //===----------------------------------------------------------------------===//
@@ -657,3 +659,17 @@
   DescFile<"DebugCheckers.cpp">;
 
 } // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetection in {
+
+def CloneChecker : Checker<"CloneChecker">,
+  HelpText<"Reports similar pieces of code.">,
+  DescFile<"CloneChecker.cpp">;
+
+} // end "clone"
+
Index: include/clang/AST/CloneDetection.h
===================================================================
--- /dev/null
+++ include/clang/AST/CloneDetection.h
@@ -0,0 +1,224 @@
+//===--- CloneDetection.h - Analyses the structure of an AST ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// /file
+/// This file defines classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+
+#include <map>
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class ASTContext;
+class CompoundStmt;
+class TranslationUnitDecl;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+  // If this object identifies a sequence of statements inside a CompoundStmt,
+  // S points to this CompoundStmt.
+  // If this object only identifies a single Stmt, then S is a pointer to this
+  // Stmt.
+  Stmt *S;
+
+  // The related ASTContext for S.
+  ASTContext *Context;
+
+  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+  /// instance is representing the CompoundStmt children inside the array
+  /// [StartIndex, EndIndex).
+  unsigned StartIndex;
+  unsigned EndIndex;
+
+public:
+  /// \brief Constructs a StmtSequence holding multiple statements.
+  ///
+  /// The resulting StmtSequence identifies a continuous sequence of statements
+  /// in the body of the given CompoundStmt.
+  /// Which statements of the body should be identified needs to be specified
+  /// by providing a start and end index that describe a non-empty sub-array
+  /// in the body of the given CompoundStmt.
+  ///
+  /// \param Stmt A CompoundStmt that contains all statements in its body.
+  /// \param Context The ASTContext for the given CompoundStmt.
+  /// \param StartIndex the inclusive start index in the children array of
+  ///                   \p Stmt
+  /// \param EndIndex the exclusive end index in the children array of \p Stmt.
+  StmtSequence(CompoundStmt *Stmt, ASTContext *Context, unsigned StartIndex,
+               unsigned EndIndex);
+
+  /// \brief Constructs a StmtSequence holding a single statement.
+  ///
+  /// \param Stmt An arbitrary Stmt.
+  /// \param Context The ASTContext for the given Stmt.
+  StmtSequence(Stmt *Stmt, ASTContext *Context);
+
+  /// \brief Constructs an empty StmtSequence.
+  StmtSequence() : StmtSequence(nullptr, nullptr) {}
+
+  /// Returns an iterator-style pointer to the first statement in this sequence.
+  Stmt *const *begin() const;
+
+  /// Returns an iterator-style pointer that points behind the last statement
+  /// in this sequence.
+  Stmt *const *end() const;
+
+  /// Returns the first statement in this sequence.
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt *front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  /// Returns the last statement in this sequence.
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt *back() const {
+    assert(!empty());
+    return begin()[size() - 1];
+  }
+
+  /// Returns the number of statements this object holds.
+  unsigned size() const {
+    if (holdsSequence())
+      return EndIndex - StartIndex;
+    if (S == nullptr)
+      return 0;
+    return 1;
+  }
+
+  /// Returns true if and only if this StmtSequence contains no statements.
+  bool empty() const { return size() == 0; }
+
+  /// Returns the related ASTContext for the stored Stmts.
+  ASTContext &getASTContext() const {
+    assert(Context);
+    return *Context;
+  }
+
+  /// Returns true if this objects holds a list of statements.
+  bool holdsSequence() const { return EndIndex != 0; }
+
+  /// Returns the start sourcelocation of the first statement in this sequence.
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getStartLoc() const;
+
+  /// Returns the end sourcelocation of the last statement in this sequence.
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getEndLoc() const;
+
+  /// Provides an strict weak ordering operator.
+  bool operator<(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) <
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  /// Returns true if and only if this sequence covers a source range that
+  /// contains the source range of the given sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object
+  /// and passed a non-empty StmtSequence object.
+  bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches clones in source code.
+///
+/// This class first needs to be provided with (possibly multiple) translation
+/// units by passing them \p AnalyzeTranslationUnitDecl .
+/// Afterwards all provided translation units can be searched for clones
+/// by calling \p findClones .
+///
+/// This class only searches for clones in exectuable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+///
+/// CloneDetector generates search data for efficiently finding clones
+/// in the given statements.
+/// This is done by hashing all statements using a locality-sensitive hash
+/// function that generates identical hash values for similar statement
+/// trees.
+class CloneDetector {
+public:
+  /// Holds the generated data for a StmtSequence
+  struct StmtData {
+    /// The generated hash value.
+    unsigned Hash;
+    /// \brief The complexity of the StmtSequence.
+    ///
+    /// This scalar values serves as a simple way of filtering clones
+    /// that are too small to be reported.
+    /// A greater value indicates that the related StmtSequence is probably
+    /// more interesting to the user.
+    unsigned Complexity;
+    StmtData() : Hash(0), Complexity(0) {}
+
+    StmtData(unsigned Hash, unsigned Complexity)
+        : Hash(Hash), Complexity(Complexity) {}
+  };
+
+  /// \brief Generates and stores search data for all statements in the
+  /// given translation unit declarations.
+  ///
+  /// This method can be called multiple times on the same CloneDetector
+  /// to search clones across translation unit boundaries.
+  void AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU);
+
+  /// \brief Returns the generated search data for the given StmtSequence.
+  ///
+  /// \param S the given StmtSequence
+  /// \param Error output parameter that is set to true if no
+  ///        data was generated for the given StmtSequence.
+  ///        This can happen if the given StmtSequence is not part of a
+  ///        translation unit that was passed to \p AnalyzeTranslationUnitDecl
+  ///        so far.
+  StmtData findHash(const StmtSequence &S, bool &Error) {
+    auto I = HashedStmts.find(S);
+    if (I == HashedStmts.end()) {
+      Error = true;
+      return StmtData();
+    }
+    Error = false;
+    return I->second;
+  }
+
+  /// \brief Stores the StmtSequence with its associated data in the search
+  ///        database of this CloneDetector.
+  ///
+  /// StmtSequence objects added by this method are included in the clone
+  /// search.
+  void add(const StmtSequence &S, const StmtData &Data);
+
+  typedef std::vector<StmtSequence> CloneGroup;
+
+  /// \brief Searches all translation units in this CloneDetector for clones.
+  ///
+  /// \param MinGroupComplexity Only return clones which have at least this
+  ///                           complexity value.
+  /// \return a list of found clone groups. Each group contains multiple
+  ///         StmtSequences that were identified to be clones of each other.
+  std::vector<CloneGroup> findClones(unsigned MinGroupComplexity = 10);
+
+private:
+  // Maps each StmtSequence to the search data that was generated for it.
+  std::map<StmtSequence, StmtData> HashedStmts;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to