Re: [PATCH] D20795: Added basic capabilities to detect source code clones.

Raphael Isemann via cfe-commits Fri, 08 Jul 2016 11:35:33 -0700

teemperor updated this revision to Diff 63266.
teemperor added a comment.

- StmtSequence is now treating all Stmt objects as const.



http://reviews.llvm.org/D20795

Files:
  include/clang/AST/CloneDetection.h
  include/clang/StaticAnalyzer/Checkers/Checkers.td
  lib/AST/CMakeLists.txt
  lib/AST/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CMakeLists.txt
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp
  test/Analysis/copypaste/test-min-max.cpp
  test/Analysis/copypaste/test-sub-sequences.cpp

Index: test/Analysis/copypaste/test-sub-sequences.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-sub-sequences.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s
+
+// We test if sub-sequences can match with normal sequences containing only
+// a single statement.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+  log2(a);
+  log(); // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) {
+  log(); // expected-note{{Related code clone is here.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) {
+  return a + b;
+}
Index: test/Analysis/copypaste/test-min-max.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-min-max.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+// False positives below. These clones should not be reported.
+
+// FIXME: Detect different binary operator kinds.
+int min1(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a < b)
+    return a;
+  return b;
+}
+
+// FIXME: Detect different variable patterns.
+int min2(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (b > a)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) {
+  return a + b;
+}
Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- /dev/null
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,67 @@
+//===--- CloneDetection.cpp - Clone detection checkers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/AST/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker : public Checker<check::EndOfTranslationUnit> {
+
+public:
+  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                             AnalysisManager &Mgr,
+                                             BugReporter &BR) const {
+  CloneDetector CloneDetector;
+  CloneDetector.AnalyzeTranslationUnitDecl(
+      TU->getASTContext().getTranslationUnitDecl());
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups;
+  CloneDetector.findClones(CloneGroups);
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+                                               "Detected code clone.");
+
+  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+                                               "Related code clone is here.");
+
+  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    DiagEngine.Report(Group.front().getStartLoc(), WarnID);
+    for (unsigned J = 1; J < Group.size(); ++J) {
+      DiagEngine.Report(Group[J].getStartLoc(), NoteID);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CloneChecker>();
+}
Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt
===================================================================
--- lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -22,6 +22,7 @@
   CheckerDocumentation.cpp
   ChrootChecker.cpp
   ClangCheckers.cpp
+  CloneChecker.cpp
   DeadStoresChecker.cpp
   DebugCheckers.cpp
   DereferenceChecker.cpp
Index: lib/AST/CloneDetection.cpp
===================================================================
--- /dev/null
+++ lib/AST/CloneDetection.cpp
@@ -0,0 +1,348 @@
+//===--- CloneDetection.cpp - Analyses the structure of an AST --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+///  This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && "Stmt must not be a nullptr");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(Stmt const *Stmt, ASTContext &Context)
+    : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+    : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // If both sequences reside in different translation units, they can never
+  // contain each other.
+  if (Context != Other.Context)
+    return false;
+
+  const SourceManager &SM = Context->getSourceManager();
+
+  // Otherwise check if the start and end locations of the current sequence
+  // surround the other sequence.
+  bool StartIsInBounds =
+      SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+      getStartLoc() == Other.getStartLoc();
+  bool EndIsInBounds =
+      SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+      Other.getEndLoc() == getEndLoc();
+  return StartIsInBounds && EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+  if (holdsSequence()) {
+    auto CS = cast<CompoundStmt>(S);
+    return CS->body_begin() + StartIndex;
+  }
+  // StmtSequence accepts only non-const Stmt objects, so this is safe.
+  return &S;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+  if (holdsSequence()) {
+    auto CS = cast<CompoundStmt>(S);
+    return CS->body_begin() + EndIndex;
+  }
+  // StmtSequence accepts only non-const Stmt objects, so this is safe.
+  return &S + 1;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// Utility class for computing a hash value.
+///
+/// This class takes a series of integer inputs and hashes them into a single
+/// integer hash value.
+///
+/// The used hash function is a generic hash function that is usually
+/// used to hash character strings or (serialized) tree data structures.
+/// It is defined as:
+///   H(D) = PrimeFactor^(N-1) * D[0] + PrimeFactor^(N-2) * D[1] + ... + D[N]
+/// where
+///   H is the calculated hash value.
+///   PrimeFactor is the used prime factor.
+///   D is the input data as a zero-indexed array.
+///   N is the length of the array D.
+///
+/// Typical useage when input data D = {23, 54, 3}:
+/// \code
+///   HashValue H;
+///   H.addData(23);
+///   H.addData(54);
+///   H.addData(3);
+///   unsigned HashValue = H.getValue();
+/// \endcode
+class HashValue {
+  unsigned Value = 0;
+
+  /// The PrimeFactor variable as used in the hash function.
+  ///
+  /// The value of this factor influences the unintended collision rate and
+  /// therefore also the performance of the clone search.
+  ///
+  /// TODO: Right now this is just an arbitrary prime number. It should be
+  /// replaced in the future with prime number that yields the lowest unintended
+  /// collision rate. One way to find a better factor is to run the
+  /// CloneDetector over a big code base with several randomly choosen
+  /// PrimeFactors and calculate the number of unintended collisions for each
+  /// candidate.
+  static const unsigned PrimeFactor = 53;
+
+public:
+  /// Computes the given data into the hash value.
+  ///
+  /// This method needs to be called for each input integer in the input data
+  /// set.
+  void addData(unsigned Data) {
+    // We calculate the new hash value as defined by the hash function.
+    Value *= PrimeFactor;
+    Value += Data;
+  }
+
+  /// Returns the calculated hash value.
+  unsigned getValue() { return Value; }
+};
+} // end anonymous namespace
+
+namespace {
+/// Traverses the AST and calculates hash values and complexity values for all
+/// statements. The results are stored in a CloneDetector object.
+///
+/// This calculation happens in linear time and each statement is only visited a
+/// fixed amount of times during this process. The hash value for a statement is
+/// first calculated from the data stored directly in the statement object.
+/// Afterwards, the hash values of the children are calculated into the
+/// computed hash value.
+class HashVisitor : public RecursiveASTVisitor<HashVisitor> {
+
+  /// Defines a placeholder hash value for missing child statements.
+  ///
+  /// The value of this constant should be a prime integer that is unlikely
+  /// to be generated by the HashValue class to prevent unintended hash
+  /// value collisions.
+  static const unsigned NullChildHashValue = 313;
+
+  CloneDetector &CD;
+  ASTContext &Context;
+
+public:
+  explicit HashVisitor(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  /// Overwritten function that the Visitor will traverse in postorder.
+  ///
+  /// We need to traverse postorder over the AST for our algorithm
+  /// to work as each parent expects that its children were already hashed.
+  bool shouldTraversePostOrder() const { return true; }
+
+  bool VisitStmt(Stmt *S);
+
+private:
+  void HandleChildHashes(HashValue &Hash, unsigned &Complexity,
+                         Stmt::const_child_iterator Iterator, unsigned StartPos,
+                         unsigned Length) {
+    auto StartIterator = Iterator;
+    for (unsigned I = 0; I < StartPos; ++I)
+      ++StartIterator;
+    auto EndIterator = Iterator;
+    for (unsigned I = 0; I < Length; ++I)
+      ++EndIterator;
+    Stmt::const_child_range Children(StartIterator, EndIterator);
+    HandleChildHashes(Hash, Complexity, Children);
+  }
+
+  /// Iterates over the specified Stmt array and computes the hash values of all
+  /// statements into the given hash value. Also, the complexity of all
+  /// statements is added to the given complexity value.
+  void HandleChildHashes(HashValue &Hash, unsigned &Complexity,
+                         Stmt::const_child_range Children) {
+    // Iterate over each child in the sub-sequence.
+    for (Stmt const *Child : Children) {
+      if (Child == nullptr) {
+        ++Complexity;
+        // We use an placeholder hash value for null children.
+        Hash.addData(NullChildHashValue);
+      } else {
+        StmtSequence ChildSequence(StmtSequence(Child, Context));
+
+        bool Error;
+        CloneDetector::StmtData Data = CD.findHash(ChildSequence, Error);
+        assert(!Error && "Child statement wasn't hashed before parent.");
+
+        Complexity += Data.Complexity;
+        Hash.addData(Data.Hash);
+      }
+    }
+  }
+
+  /// Saves the current hash value into the persistent storage of this
+  /// CloneDetector.
+  void SaveData(StmtSequence CurrentStmt, HashValue Hash, unsigned Complexity);
+};
+} // end anonymous namespace
+
+bool HashVisitor::VisitStmt(Stmt *S) {
+  StmtSequence CurrentStmt(S, Context);
+  HashValue Hash;
+  unsigned Complexity = 1;
+
+  // The only relevant data for now is the class of the statement, so we
+  // calculate the hash class into the current hash value.
+  // TODO: Handle statement class specific data.
+  Hash.addData(S->getStmtClass());
+
+  // Incorporate the hash values of all child Stmts into the current hash value.
+  HandleChildHashes(Hash, Complexity, S->children());
+
+  SaveData(CurrentStmt, Hash, Complexity);
+  return true;
+}
+
+void HashVisitor::SaveData(StmtSequence CurrentStmt, HashValue Hash,
+                           unsigned Complexity) {
+  // Save current data to the CloneDetector.
+  CloneDetector::StmtData Data(Hash.getValue(), Complexity);
+  CD.add(CurrentStmt, Data);
+
+  // If the currently hashed statement is a CompoundStmt, we also need to
+  // calculate the hash values for all possible sub-sequences in the body of
+  // the CompoundStmt.
+  auto CS = dyn_cast<CompoundStmt>(CurrentStmt.front());
+  if (!CS)
+    return;
+
+  // The length of the sub-sequence. We don't need to hash sequences with the
+  // length 1 as they are already handled when we hashed the children. We also
+  // don't need to hash the sub-sequence that contains the whole body as this
+  // would be equal to the hash of the CompoundStmt itself.
+  for (unsigned Length = 2; Length < CS->size(); ++Length) {
+    // The start index in the body of the CompoundStmt.
+    // We increase the position and rehash until the end of the sub-sequence
+    // reaches the end of the CompoundStmt body.
+    for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+      HashValue SubHash;
+      unsigned SubComplexity = 1;
+      StmtSequence Sequence(CS, Context, Pos, Pos + Length);
+
+      // We consider a sub-sequence to be CompoundStmt that only contains
+      // the children in the sub-sequence. This is necessary for matching
+      // sub-sequences with full CompoundStmts.
+      SubHash.addData(CS->getStmtClass());
+
+      HandleChildHashes(SubHash, SubComplexity, CS->child_begin(), Pos, Length);
+
+      CloneDetector::StmtData SubData(SubHash.getValue(), SubComplexity);
+      CD.add(Sequence, SubData);
+    }
+  }
+}
+
+void CloneDetector::AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU) {
+  HashVisitor visitor(*this, TU->getASTContext());
+  visitor.TraverseDecl(TU);
+}
+
+void CloneDetector::add(const StmtSequence &S, const StmtData &Data) {
+  assert(!S.empty());
+  HashedStmts[S] = Data;
+}
+
+namespace {
+/// Returns true if and only if \p Stmt contains at least one other sequence in
+/// the \p Group.
+bool StmtContainsAnyInGroup(StmtSequence &Stmt,
+                            CloneDetector::CloneGroup &Group) {
+  for (StmtSequence &GroupStmt : Group) {
+    if (Stmt.contains(GroupStmt))
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if and only if all sequences in \p OtherGroup are contained
+/// by a sequence in \p Group.
+bool GroupContains(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // We have less sequences in the current group than we have in the other,
+  // so we will never fulfill the requirement for returning true. This is only
+  // possible because we know that a sequence in Group can contain at most
+  // one sequence in OtherGroup.
+  if (Group.size() < OtherGroup.size())
+    return false;
+
+  for (StmtSequence &Stmt : Group) {
+    if (!StmtContainsAnyInGroup(Stmt, OtherGroup))
+      return false;
+  }
+  return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               unsigned MinGroupComplexity) {
+
+  std::map<unsigned, CloneGroup> GroupsByHash;
+
+  // Create a CloneGroup for every known statement.
+  for (auto &Pair : HashedStmts) {
+    // But we ignore statements that don't fulfill the complexity requirement.
+    if (Pair.second.Complexity > MinGroupComplexity) {
+      GroupsByHash[Pair.second.Hash].push_back(Pair.first);
+    }
+  }
+
+  // Put all groups that now contain at least two sequences into the result
+  // vector. We still need to filter the results at this point.
+  for (auto &HashGroupPair : GroupsByHash) {
+    if (HashGroupPair.second.size() > 1) {
+      Result.push_back(HashGroupPair.second);
+    }
+  }
+
+  std::set<unsigned> IndexesToRemove;
+
+  // Compare every group in the result with the others. If one groups contains
+  // another group, we only need to return the bigger group.
+  for (unsigned i = 0; i < Result.size(); ++i) {
+    for (unsigned j = 0; j < Result.size(); ++j) {
+      if (i != j) {
+        if (GroupContains(Result[j], Result[i])) {
+          IndexesToRemove.insert(i);
+          break;
+        }
+      }
+    }
+  }
+
+  for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
+    Result.erase(Result.begin() + (*I));
+  }
+}
Index: lib/AST/CMakeLists.txt
===================================================================
--- lib/AST/CMakeLists.txt
+++ lib/AST/CMakeLists.txt
@@ -9,6 +9,7 @@
   ASTImporter.cpp
   ASTTypeTraits.cpp
   AttrImpl.cpp
+  CloneDetection.cpp
   CXXInheritance.cpp
   Comment.cpp
   CommentBriefParser.cpp
Index: include/clang/StaticAnalyzer/Checkers/Checkers.td
===================================================================
--- include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -77,6 +77,8 @@
 def LLVM : Package<"llvm">;
 def Debug : Package<"debug">;
 
+def CloneDetection : Package<"clone">;
+
 //===----------------------------------------------------------------------===//
 // Core Checkers.
 //===----------------------------------------------------------------------===//
@@ -657,3 +659,17 @@
   DescFile<"DebugCheckers.cpp">;
 
 } // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetection in {
+
+def CloneChecker : Checker<"CloneChecker">,
+  HelpText<"Reports similar pieces of code.">,
+  DescFile<"CloneChecker.cpp">;
+
+} // end "clone"
+
Index: include/clang/AST/CloneDetection.h
===================================================================
--- /dev/null
+++ include/clang/AST/CloneDetection.h
@@ -0,0 +1,226 @@
+//===--- CloneDetection.h - Analyses the structure of an AST ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// /file
+/// This file defines classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+
+#include <map>
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class ASTContext;
+class CompoundStmt;
+class TranslationUnitDecl;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+  /// If this object identifies a sequence of statements inside a CompoundStmt,
+  /// S points to this CompoundStmt. If this object only identifies a single
+  /// Stmt, then S is a pointer to this Stmt.
+  Stmt const *S;
+
+  /// The related ASTContext for S.
+  ASTContext *Context;
+
+  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+  /// instance is representing the CompoundStmt children inside the array
+  /// [StartIndex, EndIndex).
+  unsigned StartIndex;
+  unsigned EndIndex;
+
+public:
+  /// \brief Constructs a StmtSequence holding multiple statements.
+  ///
+  /// The resulting StmtSequence identifies a continuous sequence of statements
+  /// in the body of the given CompoundStmt. Which statements of the body should
+  /// be identified needs to be specified by providing a start and end index
+  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
+  ///
+  /// \param Stmt A CompoundStmt that contains all statements in its body.
+  /// \param Context The ASTContext for the given CompoundStmt.
+  /// \param StartIndex The inclusive start index in the children array of
+  ///                   \p Stmt
+  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
+  StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+               unsigned StartIndex, unsigned EndIndex);
+
+  /// \brief Constructs a StmtSequence holding a single statement.
+  ///
+  /// \param Stmt An arbitrary Stmt.
+  /// \param Context The ASTContext for the given Stmt.
+  StmtSequence(Stmt const *Stmt, ASTContext &Context);
+
+  /// \brief Constructs an empty StmtSequence.
+  StmtSequence();
+
+  typedef Stmt const *const *iterator;
+
+  /// Returns an iterator pointing to the first statement in this sequence.
+  iterator begin() const;
+
+  /// Returns an iterator pointing behind the last statement in this sequence.
+  iterator end() const;
+
+  /// Returns the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt const *front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  /// Returns the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt const *back() const {
+    assert(!empty());
+    return begin()[size() - 1];
+  }
+
+  /// Returns the number of statements this object holds.
+  unsigned size() const {
+    if (holdsSequence())
+      return EndIndex - StartIndex;
+    if (S == nullptr)
+      return 0;
+    return 1;
+  }
+
+  /// Returns true if and only if this StmtSequence contains no statements.
+  bool empty() const { return size() == 0; }
+
+  /// Returns the related ASTContext for the stored Stmts.
+  ASTContext &getASTContext() const {
+    assert(Context);
+    return *Context;
+  }
+
+  /// Returns true if this objects holds a list of statements.
+  bool holdsSequence() const { return EndIndex != 0; }
+
+  /// Returns the start sourcelocation of the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getStartLoc() const;
+
+  /// Returns the end sourcelocation of the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getEndLoc() const;
+
+  /// Provides a strict weak ordering operator.
+  bool operator<(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) <
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  /// Returns true if and only if this sequence covers a source range that
+  /// contains the source range of the given sequence \p Other.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object
+  /// and passed a non-empty StmtSequence object.
+  bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches for clones in source code.
+///
+/// This class first needs to be provided with (possibly multiple) translation
+/// units by passing them to \p AnalyzeTranslationUnitDecl .
+/// Afterwards all provided translation units can be searched for clones
+/// by calling \p findClones .
+///
+/// This class only searches for clones in exectuable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+///
+/// CloneDetector generates search data for efficiently finding clones
+/// in the given statements. This is done by hashing all statements using a
+/// locality-sensitive hash function that generates identical hash values for
+/// similar statement trees.
+class CloneDetector {
+public:
+  /// Holds the generated data for a StmtSequence
+  struct StmtData {
+    /// The generated hash value.
+    unsigned Hash;
+    /// \brief The complexity of the StmtSequence.
+    ///
+    /// This scalar values serves as a simple way of filtering clones
+    /// that are too small to be reported. A greater value indicates that the
+    /// related StmtSequence is probably more interesting to the user.
+    unsigned Complexity;
+    StmtData() : Hash(0), Complexity(0) {}
+
+    StmtData(unsigned Hash, unsigned Complexity)
+        : Hash(Hash), Complexity(Complexity) {}
+  };
+
+  /// \brief Generates and stores search data for all statements in the
+  /// given translation unit declarations.
+  ///
+  /// This method can be called multiple times on the same CloneDetector
+  /// to search clones across translation unit boundaries.
+  void AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU);
+
+  /// \brief Returns the generated search data for the given StmtSequence.
+  ///
+  /// \param S The given StmtSequence
+  /// \param Error Output parameter that is set to true if no data was generated
+  ///        for the given StmtSequence. This can happen if the given
+  ///        StmtSequence is not part of a translation unit that was passed to
+  ///        \p AnalyzeTranslationUnitDecl so far.
+  StmtData findHash(const StmtSequence &S, bool &Error) {
+    auto I = HashedStmts.find(S);
+    if (I == HashedStmts.end()) {
+      Error = true;
+      return StmtData();
+    }
+    Error = false;
+    return I->second;
+  }
+
+  /// \brief Stores the StmtSequence with its associated data in the search
+  ///        database of this CloneDetector.
+  ///
+  /// StmtSequence objects added by this method are included in the clone
+  /// search.
+  void add(const StmtSequence &S, const StmtData &Data);
+
+  typedef std::vector<StmtSequence> CloneGroup;
+
+  /// \brief Searches all translation units in this CloneDetector for clones.
+  ///
+  /// \param Result Output parameter that is filled with a list of found
+  ///               clone groups. Each group contains multiple StmtSequences
+  ///               that were identified to be clones of each other.
+  /// \param MinGroupComplexity Only return clones which have at least this
+  ///                           complexity value.
+  void findClones(std::vector<CloneGroup> &Result,
+                  unsigned MinGroupComplexity = 10);
+
+private:
+  /// Maps each StmtSequence to the search data that was generated for it.
+  std::map<StmtSequence, StmtData> HashedStmts;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Re: [PATCH] D20795: Added basic capabilities to detect source code clones.

Reply via email to