teemperor created this revision.
teemperor added reviewers: v.g.vassilev, NoQ, zaks.anna.
teemperor added a subscriber: cfe-commits.

https://reviews.llvm.org/D23418

Files:
  include/clang/Analysis/CloneDetection.h
  lib/Analysis/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp

Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- lib/StaticAnalyzer/Checkers/CloneChecker.cpp
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -36,11 +36,9 @@
                                  AnalysisManager &Mgr, BugReporter &BR) const;
 
   /// \brief Reports all clones to the user.
-  void reportClones(SourceManager &SM, AnalysisManager &Mgr,
-                    int MinComplexity) const {
-
-    std::vector<CloneDetector::CloneGroup> CloneGroups;
-    CloneDetector.findClones(CloneGroups, MinComplexity);
+  void
+  reportClones(SourceManager &SM, AnalysisManager &Mgr,
+               const std::vector<CloneDetector::CloneGroup> &Clones) const {
 
     DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
 
@@ -50,7 +48,7 @@
     unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
                                                  "Related code clone is here.");
 
-    for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    for (const CloneDetector::CloneGroup &Group : Clones) {
       // We group the clones by printing the first as a warning and all others
       // as a note.
       DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
@@ -62,11 +60,37 @@
 
   /// \brief Reports only suspicious clones to the user along with informaton
   ///        that explain why they are suspicious.
-  void reportSuspiciousClones(SourceManager &SM, AnalysisManager &Mgr,
-                              int MinComplexity) const {
-
-    std::vector<CloneDetector::SuspiciousClonePair> Clones;
-    CloneDetector.findSuspiciousClones(Clones, MinComplexity);
+  void reportSuspiciousClones(
+      SourceManager &SM, AnalysisManager &Mgr,
+      const std::vector<CloneDetector::CloneGroup> &Clones) const {
+
+    std::vector<VariablePattern::SuspiciousClonePair> Pairs;
+
+    for (const CloneDetector::CloneGroup &Group : Clones) {
+      for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
+        VariablePattern PatternA(Group.Sequences[i]);
+
+        for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
+          VariablePattern PatternB(Group.Sequences[j]);
+
+          VariablePattern::SuspiciousClonePair ClonePair;
+          // For now, we only report clones which break the variable pattern
+          // just
+          // once because multiple differences in a pattern are an indicator
+          // that
+          // those differences are maybe intended (e.g. because it's actually
+          // a different algorithm).
+          // TODO: In very big clones even multiple variables can be unintended,
+          // so replacing this number with a percentage could better handle such
+          // cases. On the other hand it could increase the false-positive rate
+          // for all clones if the percentage is too high.
+          if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) {
+            Pairs.push_back(ClonePair);
+            break;
+          }
+        }
+      }
+    }
 
     DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
 
@@ -81,7 +105,7 @@
         DiagnosticsEngine::Note, "Or maybe you wanted to use %0 here in this "
                                  "similar piece of code?");
 
-    for (CloneDetector::SuspiciousClonePair &Pair : Clones) {
+    for (VariablePattern::SuspiciousClonePair &Pair : Pairs) {
       // The first clone always has a suggestion and we report it to the user
       // along with the place where the suggestion should be used.
       DiagEngine.Report(Pair.FirstClone.Location.getBegin(), WarnID)
@@ -118,6 +142,8 @@
   // At this point, every statement in the translation unit has been analyzed by
   // the CloneDetector. The only thing left to do is to report the found clones.
 
+  SourceManager &SM = BR.getSourceManager();
+
   int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
       "MinimumCloneComplexity", 10, this);
   assert(MinComplexity >= 0);
@@ -128,11 +154,24 @@
   bool ReportNormalClones = Mgr.getAnalyzerOptions().getBooleanOption(
       "ReportNormalClones", true, this);
 
+  std::vector<CloneDetector::CloneGroup> AllCloneGroups;
+  CloneDetector::PassList Passes = {
+      new MinComplexityPass(MinComplexity), new MinGroupSizePass(2),
+      new VerifyCloneHashesPass(), new OnlyLargestClonePass()};
+  CloneDetector.findClones(AllCloneGroups, Passes);
+
   if (ReportSuspiciousClones)
-    reportSuspiciousClones(BR.getSourceManager(), Mgr, MinComplexity);
+    reportSuspiciousClones(SM, Mgr, AllCloneGroups);
+
+  CloneDetector::PassList OnlyMatchingVariablePatterns = {
+      new MinGroupSizePass(2), new MatchingVariablePatternPass()};
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups;
+  CloneDetector::applyPasses(AllCloneGroups, CloneGroups,
+                             OnlyMatchingVariablePatterns);
 
   if (ReportNormalClones)
-    reportClones(BR.getSourceManager(), Mgr, MinComplexity);
+    reportClones(SM, Mgr, CloneGroups);
 }
 
 //===----------------------------------------------------------------------===//
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- lib/Analysis/CloneDetection.cpp
+++ lib/Analysis/CloneDetection.cpp
@@ -80,164 +80,6 @@
 SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
 
 namespace {
-
-/// \brief Analyzes the pattern of the referenced variables in a statement.
-class VariablePattern {
-
-  /// \brief Describes an occurence of a variable reference in a statement.
-  struct VariableOccurence {
-    /// The index of the associated VarDecl in the Variables vector.
-    size_t KindID;
-    /// The location in the code where the variable was references.
-    SourceRange Location;
-
-    VariableOccurence(size_t KindID, SourceRange Location)
-        : KindID(KindID), Location(Location) {}
-  };
-
-  /// All occurences of referenced variables in the order of appearance.
-  std::vector<VariableOccurence> Occurences;
-  /// List of referenced variables in the order of appearance.
-  /// Every item in this list is unique.
-  std::vector<const VarDecl *> Variables;
-
-  /// \brief Adds a new variable referenced to this pattern.
-  /// \param VarDecl The declaration of the variable that is referenced.
-  /// \param Location The SourceRange where this variable is referenced.
-  void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location) {
-    // First check if we already reference this variable
-    for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
-      if (Variables[KindIndex] == VarDecl) {
-        // If yes, add a new occurence that points to the existing entry in
-        // the Variables vector.
-        Occurences.emplace_back(KindIndex, Location);
-        return;
-      }
-    }
-    // If this variable wasn't already referenced, add it to the list of
-    // referenced variables and add a occurence that points to this new entry.
-    Occurences.emplace_back(Variables.size(), Location);
-    Variables.push_back(VarDecl);
-  }
-
-  /// \brief Adds each referenced variable from the given statement.
-  void addVariables(const Stmt *S) {
-    // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
-    // children). We skip such statements as they don't reference any
-    // variables.
-    if (!S)
-      return;
-
-    // Check if S is a reference to a variable. If yes, add it to the pattern.
-    if (auto D = dyn_cast<DeclRefExpr>(S)) {
-      if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
-        addVariableOccurence(VD, D->getSourceRange());
-    }
-
-    // Recursively check all children of the given statement.
-    for (const Stmt *Child : S->children()) {
-      addVariables(Child);
-    }
-  }
-
-public:
-  /// \brief Creates an VariablePattern object with information about the given
-  ///        StmtSequence.
-  VariablePattern(const StmtSequence &Sequence) {
-    for (const Stmt *S : Sequence)
-      addVariables(S);
-  }
-
-  /// \brief Compares this pattern with the given one.
-  /// \param Other The given VariablePattern to compare with.
-  /// \param FirstMismatch Output parameter that will be filled with information
-  ///        about the first difference between the two patterns. This parameter
-  ///        can be a nullptr, in which case it will be ignored.
-  /// \return Returns the number of differences between the pattern this object
-  ///         is following and the given VariablePattern.
-  ///
-  /// For example, the following statements all have the same pattern and this
-  /// function would return zero:
-  ///
-  ///   if (a < b) return a; return b;
-  ///   if (x < y) return x; return y;
-  ///   if (u2 < u1) return u2; return u1;
-  ///
-  /// But the following statement has a different pattern (note the changed
-  /// variables in the return statements) and would have two differences when
-  /// compared with one of the statements above.
-  ///
-  ///   if (a < b) return b; return a;
-  ///
-  /// This function should only be called if the related statements of the given
-  /// pattern and the statements of this objects are clones of each other.
-  unsigned getPatternDifferences(
-      const VariablePattern &Other,
-      CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) {
-    unsigned NumberOfDifferences = 0;
-
-    assert(Other.Occurences.size() == Occurences.size());
-    for (unsigned i = 0; i < Occurences.size(); ++i) {
-      auto ThisOccurence = Occurences[i];
-      auto OtherOccurence = Other.Occurences[i];
-      if (ThisOccurence.KindID != OtherOccurence.KindID) {
-        ++NumberOfDifferences;
-
-        // If FirstMismatch is not a nullptr, we need to store information about
-        // the first difference between the two patterns.
-        if (FirstMismatch == nullptr)
-          continue;
-
-        // Only proceed if we just found the first difference as we only store
-        // information about the first difference.
-        if (NumberOfDifferences != 1)
-          continue;
-
-        const VarDecl *FirstSuggestion = nullptr;
-        // If there is a variable available in the list of referenced variables
-        // which wouldn't break the pattern if it is used in place of the
-        // current variable, we provide this variable as the suggested fix.
-        if (OtherOccurence.KindID < Variables.size())
-          FirstSuggestion = Variables[OtherOccurence.KindID];
-
-        // Store information about the first clone.
-        FirstMismatch->FirstClone =
-            CloneDetector::SuspiciousClonePair::SuspiciousClone(
-                Variables[ThisOccurence.KindID], ThisOccurence.Location,
-                FirstSuggestion);
-
-        // Same as above but with the other clone. We do this for both clones as
-        // we don't know which clone is the one containing the unintended
-        // pattern error.
-        const VarDecl *SecondSuggestion = nullptr;
-        if (ThisOccurence.KindID < Other.Variables.size())
-          SecondSuggestion = Other.Variables[ThisOccurence.KindID];
-
-        // Store information about the second clone.
-        FirstMismatch->SecondClone =
-            CloneDetector::SuspiciousClonePair::SuspiciousClone(
-                Variables[ThisOccurence.KindID], OtherOccurence.Location,
-                SecondSuggestion);
-
-        // SuspiciousClonePair guarantees that the first clone always has a
-        // suggested variable associated with it. As we know that one of the two
-        // clones in the pair always has suggestion, we swap the two clones
-        // in case the first clone has no suggested variable which means that
-        // the second clone has a suggested variable and should be first.
-        if (!FirstMismatch->FirstClone.Suggestion)
-          std::swap(FirstMismatch->FirstClone, FirstMismatch->SecondClone);
-
-        // This ensures that we always have at least one suggestion in a pair.
-        assert(FirstMismatch->FirstClone.Suggestion);
-      }
-    }
-
-    return NumberOfDifferences;
-  }
-};
-}
-
-namespace {
 /// \brief Collects the data of a single Stmt.
 ///
 /// This class defines what a code clone is: If it collects for two statements
@@ -529,6 +371,73 @@
   Sequences.push_back(std::make_pair(Signature, S));
 }
 
+void CloneDetector::applyPasses(const std::vector<CloneGroup> &CloneGroups,
+                                std::vector<CloneDetector::CloneGroup> &Result,
+                                PassList &Passes) {
+  for (auto &HashGroup : CloneGroups) {
+    // Contains all indexes in HashGroup that were already added to a
+    // CloneGroup.
+    std::vector<char> Indexes;
+    Indexes.resize(HashGroup.Sequences.size());
+
+    for (unsigned i = 0; i < HashGroup.Sequences.size(); ++i) {
+      // Skip indexes that are already part of a CloneGroup.
+      if (Indexes[i])
+        continue;
+
+      // Pick the first unhandled StmtSequence and consider it as the beginning
+      // of a new CloneGroup for now.
+      // We don't add i to Indexes because we never iterate back.
+      StmtSequence Prototype = HashGroup.Sequences[i];
+      CloneDetector::CloneGroup PotentialGroup(Prototype, HashGroup.Signature);
+      ++Indexes[i];
+
+      if (!std::all_of(Passes.begin(), Passes.end(),
+                       [&Prototype, &PotentialGroup](Pass *Pass) {
+                         return Pass->passPrototype(Prototype, PotentialGroup);
+                       }))
+        continue;
+
+      // Check all following StmtSequences for clones.
+      for (unsigned j = i + 1; j < HashGroup.Sequences.size(); ++j) {
+        // Skip indexes that are already part of a CloneGroup.
+        if (Indexes[j])
+          continue;
+
+        // If a following StmtSequence belongs to our CloneGroup, we add it to
+        // it.
+        const StmtSequence &Candidate = HashGroup.Sequences[j];
+
+        if (!std::all_of(Passes.begin(), Passes.end(),
+                         [&Prototype, &Candidate](Pass *Pass) {
+                           return Pass->passCandidate(Prototype, Candidate);
+                         }))
+          continue;
+
+        PotentialGroup.Sequences.push_back(Candidate);
+        // Make sure we never visit this StmtSequence again.
+        ++Indexes[j];
+      }
+
+      if (!std::all_of(Passes.begin(), Passes.end(),
+                       [&PotentialGroup](Pass *Pass) {
+                         return Pass->passGroup(PotentialGroup);
+                       }))
+        continue;
+
+      // Otherwise, add it to the result and continue searching for more groups.
+      Result.push_back(PotentialGroup);
+    }
+
+    assert(std::all_of(Indexes.begin(), Indexes.end(),
+                       [](char c) { return c == 1; }));
+  }
+
+  for (Pass *P : Passes) {
+    P->processResult(Result);
+  }
+}
+
 namespace {
 /// \brief Returns true if and only if \p Stmt contains at least one other
 /// sequence in the \p Group.
@@ -559,11 +468,69 @@
 }
 } // end anonymous namespace
 
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               PassList &Passes) {
+  // Shortcut and necessary for the for-loop later in this function.
+  if (Sequences.empty())
+    return;
+
+  std::stable_sort(Sequences.begin(), Sequences.end(),
+                   [](std::pair<CloneSignature, StmtSequence> LHS,
+                      std::pair<CloneSignature, StmtSequence> RHS) {
+                     return LHS.first.Hash < RHS.first.Hash;
+                   });
+
+  std::vector<CloneGroup> CloneGroups;
+
+  // Check for each CloneSignature if its successor has the same hash value.
+  // We don't check the last CloneSignature as it has no successor.
+  // Note: The 'size - 1 ' in the condition is safe because we check for an
+  // empty Sequences vector at the beginning of this function.
+  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
+    const auto Current = Sequences[i];
+    const auto Next = Sequences[i + 1];
+
+    if (Current.first.Hash != Next.first.Hash)
+      continue;
+
+    // It's likely that we just found an sequence of CloneSignatures that
+    // represent a CloneGroup, so we create a new group and start checking and
+    // adding the CloneSignatures in this sequence.
+    CloneGroup Group;
+    Group.Signature = Current.first;
+
+    for (; i < Sequences.size(); ++i) {
+      const auto &Signature = Sequences[i];
+
+      // A different hash value means we have reached the end of the sequence.
+      if (Current.first.Hash != Signature.first.Hash) {
+        // The current Signature could be the start of a new CloneGroup. So we
+        // decrement i so that we visit it again in the outer loop.
+        // Note: i can never be 0 at this point because we are just comparing
+        // the hash of the Current CloneSignature with itself in the 'if' above.
+        assert(i != 0);
+        --i;
+        break;
+      }
+
+      Group.Sequences.push_back(Signature.second);
+    }
+
+    if (!std::all_of(Passes.begin(), Passes.end(), [&Group](Pass *Pass) {
+          return Pass->passHashGroup(Group);
+        }))
+      continue;
+
+    CloneGroups.push_back(Group);
+  }
+
+  applyPasses(CloneGroups, Result, Passes);
+}
+
 namespace {
 /// \brief Wrapper around FoldingSetNodeID that it can be used as the template
 ///        argument of the StmtDataCollector.
 class FoldingSetWrapper {
-
   llvm::FoldingSetNodeID &FS;
 
 public:
@@ -610,144 +577,134 @@
   return DataLHS == DataRHS;
 }
 
-/// \brief Finds all actual clone groups in a single group of presumed clones.
-/// \param Result Output parameter to which all found groups are added.
-/// \param Group A group of presumed clones. The clones are allowed to have a
-///              different variable pattern and may not be actual clones of each
-///              other.
-/// \param CheckVariablePatterns If true, every clone in a group that was added
-///              to the output follows the same variable pattern as the other
-///              clones in its group.
-static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
-                              const CloneDetector::CloneGroup &Group,
-                              bool CheckVariablePattern) {
-  // We remove the Sequences one by one, so a list is more appropriate.
-  std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
-                                              Group.Sequences.end());
-
-  // Search for clones as long as there could be clones in UnassignedSequences.
-  while (UnassignedSequences.size() > 1) {
-
-    // Pick the first Sequence as a protoype for a new clone group.
-    StmtSequence Prototype = UnassignedSequences.front();
-    UnassignedSequences.pop_front();
-
-    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature);
-
-    // Analyze the variable pattern of the prototype. Every other StmtSequence
-    // needs to have the same pattern to get into the new clone group.
-    VariablePattern PrototypeFeatures(Prototype);
-
-    // Search all remaining StmtSequences for an identical variable pattern
-    // and assign them to our new clone group.
-    auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
-    while (I != E) {
-      // If the sequence doesn't fit to the prototype, we have encountered
-      // an unintended hash code collision and we skip it.
-      if (!areSequencesClones(Prototype, *I)) {
-        ++I;
-        continue;
-      }
-
-      // If we weren't asked to check for a matching variable pattern in clone
-      // groups we can add the sequence now to the new clone group.
-      // If we were asked to check for matching variable pattern, we first have
-      // to check that there are no differences between the two patterns and
-      // only proceed if they match.
-      if (!CheckVariablePattern ||
-          VariablePattern(*I).getPatternDifferences(PrototypeFeatures) == 0) {
-        FilteredGroup.Sequences.push_back(*I);
-        I = UnassignedSequences.erase(I);
-        continue;
-      }
-
-      // We didn't found a matching variable pattern, so we continue with the
-      // next sequence.
-      ++I;
+void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,
+                                           SourceRange Location) {
+  // First check if we already reference this variable
+  for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
+    if (Variables[KindIndex] == VarDecl) {
+      // If yes, add a new occurence that points to the existing entry in
+      // the Variables vector.
+      Occurences.emplace_back(KindIndex, Location);
+      return;
     }
-
-    // Add a valid clone group to the list of found clone groups.
-    if (!FilteredGroup.isValid())
-      continue;
-    Result.push_back(FilteredGroup);
   }
+  // If this variable wasn't already referenced, add it to the list of
+  // referenced variables and add a occurence that points to this new entry.
+  Occurences.emplace_back(Variables.size(), Location);
+  Variables.push_back(VarDecl);
 }
 
-void CloneDetector::findClones(std::vector<CloneGroup> &Result,
-                               unsigned MinGroupComplexity,
-                               bool CheckPatterns) {
-  // A shortcut (and necessary for the for-loop later in this function).
-  if (Sequences.empty())
+void VariablePattern::addVariables(const Stmt *S) {
+  // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
+  // children). We skip such statements as they don't reference any
+  // variables.
+  if (!S)
     return;
 
-  // We need to search for groups of StmtSequences with the same hash code to
-  // create our initial clone groups. By sorting all known StmtSequences by
-  // their hash value we make sure that StmtSequences with the same hash code
-  // are grouped together in the Sequences vector.
-  // Note: We stable sort here because the StmtSequences are added in the order
-  // in which they appear in the source file. We want to preserve that order
-  // because we also want to report them in that order in the CloneChecker.
-  std::stable_sort(Sequences.begin(), Sequences.end(),
-                   [](std::pair<CloneSignature, StmtSequence> LHS,
-                      std::pair<CloneSignature, StmtSequence> RHS) {
-                     return LHS.first.Hash < RHS.first.Hash;
-                   });
+  // Check if S is a reference to a variable. If yes, add it to the pattern.
+  if (auto D = dyn_cast<DeclRefExpr>(S)) {
+    if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
+      addVariableOccurence(VD, D->getSourceRange());
+  }
 
-  std::vector<CloneGroup> CloneGroups;
+  // Recursively check all children of the given statement.
+  for (const Stmt *Child : S->children()) {
+    addVariables(Child);
+  }
+}
 
-  // Check for each CloneSignature if its successor has the same hash value.
-  // We don't check the last CloneSignature as it has no successor.
-  // Note: The 'size - 1' in the condition is safe because we check for an empty
-  // Sequences vector at the beginning of this function.
-  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
-    const auto Current = Sequences[i];
-    const auto Next = Sequences[i + 1];
+unsigned
+VariablePattern::getPatternDifferences(const VariablePattern &Other,
+                                       SuspiciousClonePair *FirstMismatch) {
+  unsigned NumberOfDifferences = 0;
+
+  assert(Other.Occurences.size() == Occurences.size());
+  for (unsigned i = 0; i < Occurences.size(); ++i) {
+    auto ThisOccurence = Occurences[i];
+    auto OtherOccurence = Other.Occurences[i];
+    if (ThisOccurence.KindID != OtherOccurence.KindID) {
+      ++NumberOfDifferences;
+
+      // If FirstMismatch is not a nullptr, we need to store information about
+      // the first difference between the two patterns.
+      if (FirstMismatch == nullptr)
+        continue;
 
-    if (Current.first.Hash != Next.first.Hash)
-      continue;
+      // Only proceed if we just found the first difference as we only store
+      // information about the first difference.
+      if (NumberOfDifferences != 1)
+        continue;
 
-    // It's likely that we just found an sequence of CloneSignatures that
-    // represent a CloneGroup, so we create a new group and start checking and
-    // adding the CloneSignatures in this sequence.
-    CloneGroup Group;
-    Group.Signature = Current.first;
+      const VarDecl *FirstSuggestion = nullptr;
+      // If there is a variable available in the list of referenced variables
+      // which wouldn't break the pattern if it is used in place of the
+      // current variable, we provide this variable as the suggested fix.
+      if (OtherOccurence.KindID < Variables.size())
+        FirstSuggestion = Variables[OtherOccurence.KindID];
+
+      // Store information about the first clone.
+      FirstMismatch->FirstClone = SuspiciousClonePair::SuspiciousClone(
+          Variables[ThisOccurence.KindID], ThisOccurence.Location,
+          FirstSuggestion);
+
+      // Same as above but with the other clone. We do this for both clones as
+      // we don't know which clone is the one containing the unintended
+      // pattern error.
+      const VarDecl *SecondSuggestion = nullptr;
+      if (ThisOccurence.KindID < Other.Variables.size())
+        SecondSuggestion = Other.Variables[ThisOccurence.KindID];
+
+      // Store information about the second clone.
+      FirstMismatch->SecondClone = SuspiciousClonePair::SuspiciousClone(
+          Variables[ThisOccurence.KindID], OtherOccurence.Location,
+          SecondSuggestion);
+
+      // SuspiciousClonePair guarantees that the first clone always has a
+      // suggested variable associated with it. As we know that one of the two
+      // clones in the pair always has suggestion, we swap the two clones
+      // in case the first clone has no suggested variable which means that
+      // the second clone has a suggested variable and should be first.
+      if (!FirstMismatch->FirstClone.Suggestion)
+        std::swap(FirstMismatch->FirstClone, FirstMismatch->SecondClone);
+
+      // This ensures that we always have at least one suggestion in a pair.
+      assert(FirstMismatch->FirstClone.Suggestion);
+    }
+  }
 
-    for (; i < Sequences.size(); ++i) {
-      const auto &Signature = Sequences[i];
+  return NumberOfDifferences;
+}
 
-      // A different hash value means we have reached the end of the sequence.
-      if (Current.first.Hash != Signature.first.Hash) {
-        // The current Signature could be the start of a new CloneGroup. So we
-        // decrement i so that we visit it again in the outer loop.
-        // Note: i can never be 0 at this point because we are just comparing
-        // the hash of the Current CloneSignature with itself in the 'if' above.
-        assert(i != 0);
-        --i;
-        break;
-      }
+bool CloneDetector::Pass::passHashGroup(
+    const CloneDetector::CloneGroup &HashGroup) {
+  return true;
+}
 
-      // Skip CloneSignatures that won't pass the complexity requirement.
-      if (Signature.first.Complexity < MinGroupComplexity)
-        continue;
+bool CloneDetector::Pass::passPrototype(
+    const StmtSequence &Prototype,
+    const CloneDetector::CloneGroup &PrototypeGroup) {
+  return true;
+}
 
-      Group.Sequences.push_back(Signature.second);
-    }
+bool CloneDetector::Pass::passCandidate(const StmtSequence &Prototype,
+                                        const StmtSequence &Candidate) {
+  return true;
+}
 
-    // There is a chance that we haven't found more than two fitting
-    // CloneSignature because not enough CloneSignatures passed the complexity
-    // requirement. As a CloneGroup with less than two members makes no sense,
-    // we ignore this CloneGroup and won't add it to the result.
-    if (!Group.isValid())
-      continue;
+bool CloneDetector::Pass::passGroup(const CloneDetector::CloneGroup &Group) {
+  return true;
+}
 
-    CloneGroups.push_back(Group);
-  }
+void CloneDetector::Pass::processResult(
+    std::vector<CloneDetector::CloneGroup> &Result) {}
 
-  // Add every valid clone group that fulfills the complexity requirement.
-  for (const CloneGroup &Group : CloneGroups) {
-    createCloneGroups(Result, Group, CheckPatterns);
-  }
+bool VerifyCloneHashesPass::passCandidate(const StmtSequence &Prototype,
+                                          const StmtSequence &Candidate) {
+  return areSequencesClones(Prototype, Candidate);
+}
 
+void OnlyLargestClonePass::processResult(
+    std::vector<CloneDetector::CloneGroup> &Result) {
   std::vector<unsigned> IndexesToRemove;
 
   // Compare every group in the result with the rest. If one groups contains
@@ -774,37 +731,3 @@
     Result.erase(Result.begin() + *I);
   }
 }
-
-void CloneDetector::findSuspiciousClones(
-    std::vector<CloneDetector::SuspiciousClonePair> &Result,
-    unsigned MinGroupComplexity) {
-  std::vector<CloneGroup> Clones;
-  // Reuse the normal search for clones but specify that the clone groups don't
-  // need to have a common referenced variable pattern so that we can manually
-  // search for the kind of pattern errors this function is supposed to find.
-  findClones(Clones, MinGroupComplexity, false);
-
-  for (const CloneGroup &Group : Clones) {
-    for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
-      VariablePattern PatternA(Group.Sequences[i]);
-
-      for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
-        VariablePattern PatternB(Group.Sequences[j]);
-
-        CloneDetector::SuspiciousClonePair ClonePair;
-        // For now, we only report clones which break the variable pattern just
-        // once because multiple differences in a pattern are an indicator that
-        // those differences are maybe intended (e.g. because it's actually
-        // a different algorithm).
-        // TODO: In very big clones even multiple variables can be unintended,
-        // so replacing this number with a percentage could better handle such
-        // cases. On the other hand it could increase the false-positive rate
-        // for all clones if the percentage is too high.
-        if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) {
-          Result.push_back(ClonePair);
-          break;
-        }
-      }
-    }
-  }
-}
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- include/clang/Analysis/CloneDetection.h
+++ include/clang/Analysis/CloneDetection.h
@@ -17,7 +17,6 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/StringMap.h"
 
 #include <vector>
 
@@ -158,6 +157,7 @@
 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
 /// are not supported.
 class CloneDetector {
+
 public:
   typedef unsigned DataPiece;
 
@@ -194,8 +194,7 @@
   };
 
   /// Holds group of StmtSequences that are clones of each other and the
-  /// complexity value (see CloneSignature::Complexity) that all stored
-  /// StmtSequences have in common.
+  /// CloneSignature that all clones have in common.
   struct CloneGroup {
     std::vector<StmtSequence> Sequences;
     CloneSignature Signature;
@@ -206,13 +205,6 @@
         : Signature(Signature) {
       Sequences.push_back(Seq);
     }
-
-    /// \brief Returns false if and only if this group should be skipped when
-    ///        searching for clones.
-    bool isValid() const {
-      // A clone group with only one member makes no sense, so we skip them.
-      return Sequences.size() > 1;
-    }
   };
 
   /// \brief Generates and stores search data for all statements in the body of
@@ -222,17 +214,129 @@
   /// \brief Stores the CloneSignature to allow future querying.
   void add(const StmtSequence &S, const CloneSignature &Signature);
 
-  /// \brief Searches the provided statements for clones.
+  /// \brief Represents a way to filter clones from a list of clone groups.
+  ///
+  /// A reoccuring task in the CloneDetector is filtering out clones that don't
+  /// have certain properties. This filtering is done in a pass architecture
+  /// in which each pass is supposed to remove a certain kind of clones.
+  /// Usually, passes are grouped together in a PassList and are applied one
+  /// after another until all undesireable clones are removed.
+  ///
+  /// To create a new pass, inherit from this class and overwrite the respective
+  /// virtual functions.
+  class Pass {
+  public:
+    virtual ~Pass() {}
+
+    /// \brief Allows skipping the processing of a given hash group.
+    /// \param HashGroup The given hash group.
+    /// \return Return true if the CloneDetector should start creating actual
+    ///         clone groups from the given hash group.
+    virtual bool passHashGroup(const CloneGroup &HashGroup);
+
+    /// \brief Allows skipping the creation of clone groups from a given
+    ///        prototype StmtSequence.
+    /// \param Prototype The given prototype StmtSequence.
+    /// \param PrototypeGroup The clone group to which the Prototype belongs.
+    /// \return Return true if the CloneDetector should search for candidates
+    ///         that will fit into the given prototype group.
+    virtual bool passPrototype(const StmtSequence &Prototype,
+                               const CloneGroup &PrototypeGroup);
+
+    /// \brief Decides if a StmtSequence belongs into a certain clone group.
+    /// \param Prototype The prototype that
+    /// \param Candidate
+    /// \return Return true if the candidate fits to the prototype.
+    virtual bool passCandidate(const StmtSequence &Prototype,
+                               const StmtSequence &Candidate);
+
+    /// \brief Decides if a given clone group should be added to the result.
+    /// \param Group The given clone group.
+    /// \return Return true if the given group should be included in the result.
+    virtual bool passGroup(const CloneGroup &Group);
+
+    /// \brief Can process and modify the clone groups before they are returned.
+    /// \param Result The list of clone groups.
+    virtual void processResult(std::vector<CloneGroup> &Result);
+  };
+
+  /// \brief Stores a list of passes.
   ///
-  /// \param Result Output parameter that is filled with a list of found
-  ///               clone groups. Each group contains multiple StmtSequences
-  ///               that were identified to be clones of each other.
-  /// \param MinGroupComplexity Only return clones which have at least this
-  ///                           complexity value.
-  /// \param CheckPatterns Returns only clone groups in which the referenced
-  ///                      variables follow the same pattern.
-  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
-                  bool CheckPatterns = true);
+  /// This class also takes over the memory ownership of all given passes.
+  class PassList {
+    std::vector<Pass *> Passes;
+
+  public:
+    PassList(std::initializer_list<Pass *> IL) : Passes(IL.begin(), IL.end()) {}
+    ~PassList() {
+      for (Pass *P : Passes) {
+        delete P;
+      }
+    }
+
+    // Non-copyable because we free all passes in the destructor.
+    PassList(const PassList &other) = delete;
+    PassList &operator=(const PassList &) = delete;
+
+    std::vector<Pass *>::iterator begin() { return Passes.begin(); }
+    std::vector<Pass *>::iterator end() { return Passes.end(); }
+  };
+
+  /// \brief Generates hash groups and applies the given passes to them.
+  /// \param Result Output parameter to which all created clone groups are
+  ///               added.
+  /// \param Passes The passes that should be used to filter the hash groups.
+  void findClones(std::vector<CloneGroup> &Result, PassList &Passes);
+
+  /// \brief Applies the given passes on a list of clone groups.
+  /// \param CloneGroups The list of clone groups.
+  /// \param Result Output parameter to which all created clone groups are
+  ///               added.
+  /// \param Passes The passes that should be applied.
+  static void applyPasses(const std::vector<CloneGroup> &CloneGroups,
+                          std::vector<CloneGroup> &Result, PassList &Passes);
+
+private:
+  /// Stores all encountered StmtSequences alongside their CloneSignature.
+  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
+};
+
+/// \brief Analyzes the pattern of the referenced variables in a statement.
+class VariablePattern {
+
+  /// \brief Describes an occurence of a variable reference in a statement.
+  struct VariableOccurence {
+    /// The index of the associated VarDecl in the Variables vector.
+    size_t KindID;
+    /// The location in the code where the variable was references.
+    SourceRange Location;
+
+    VariableOccurence(size_t KindID, SourceRange Location)
+        : KindID(KindID), Location(Location) {}
+  };
+
+  /// All occurences of referenced variables in the order of appearance.
+  std::vector<VariableOccurence> Occurences;
+  /// List of referenced variables in the order of appearance.
+  /// Every item in this list is unique.
+  std::vector<const VarDecl *> Variables;
+
+  /// \brief Adds a new variable referenced to this pattern.
+  /// \param VarDecl The declaration of the variable that is referenced.
+  /// \param Location The SourceRange where this variable is referenced.
+  void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location);
+
+  /// \brief Adds each referenced variable from the given statement.
+  void addVariables(const Stmt *S);
+
+public:
+  /// \brief Creates an VariablePattern object with information about the given
+  ///        StmtSequence.
+  VariablePattern(const StmtSequence &Sequence) {
+    for (const Stmt *S : Sequence)
+      addVariables(S);
+  }
+  VariablePattern() {}
 
   /// \brief Describes two clones that reference their variables in a different
   ///        pattern which could indicate a programming error.
@@ -259,17 +363,110 @@
     SuspiciousClone SecondClone;
   };
 
-  /// \brief Searches the provided statements for pairs of clones that don't
-  ///        follow the same pattern when referencing variables.
-  /// \param Result Output parameter that will contain the clone pairs.
-  /// \param MinGroupComplexity Only clone pairs in which the clones have at
-  ///                           least this complexity value.
-  void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
-                            unsigned MinGroupComplexity);
+  /// \brief Compares this pattern with the given one.
+  /// \param Other The given VariablePattern to compare with.
+  /// \param FirstMismatch Output parameter that will be filled with information
+  ///        about the first difference between the two patterns. This parameter
+  ///        can be a nullptr, in which case it will be ignored.
+  /// \return Returns the number of differences between the pattern this object
+  ///         is following and the given VariablePattern.
+  ///
+  /// For example, the following statements all have the same pattern and this
+  /// function would return zero:
+  ///
+  ///   if (a < b) return a; return b;
+  ///   if (x < y) return x; return y;
+  ///   if (u2 < u1) return u2; return u1;
+  ///
+  /// But the following statement has a different pattern (note the changed
+  /// variables in the return statements) and would have two differences when
+  /// compared with one of the statements above.
+  ///
+  ///   if (a < b) return b; return a;
+  ///
+  /// This function should only be called if the related statements of the given
+  /// pattern and the statements of this objects are clones of each other.
+  unsigned getPatternDifferences(const VariablePattern &Other,
+                                 SuspiciousClonePair *FirstMismatch = nullptr);
+};
+
+/// \brief Filters out all clones that don't reference variables in the same
+///        pattern.
+class MatchingVariablePatternPass : public CloneDetector::Pass {
+  VariablePattern PrototypePattern;
 
-private:
-  /// Stores all encountered StmtSequences alongside their CloneSignature.
-  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
+public:
+  virtual ~MatchingVariablePatternPass() {}
+
+  virtual bool
+  passPrototype(const StmtSequence &Prototype,
+                const CloneDetector::CloneGroup &PrototypeGroup) override {
+    PrototypePattern = VariablePattern(Prototype);
+    return true;
+  }
+
+  virtual bool passCandidate(const StmtSequence &Prototype,
+                             const StmtSequence &Candidate) override {
+    VariablePattern CandidatePattern(Candidate);
+    return PrototypePattern.getPatternDifferences(CandidatePattern) == 0;
+  }
+};
+
+/// \brief Filters all clone groups that don't have at least the given size.
+class MinGroupSizePass : public CloneDetector::Pass {
+  unsigned MinGroupSize;
+
+public:
+  virtual ~MinGroupSizePass() {}
+  MinGroupSizePass(unsigned MinGroupSize = 2) : MinGroupSize(MinGroupSize) {}
+
+  virtual bool
+  passHashGroup(const CloneDetector::CloneGroup &HashGroup) override {
+    return HashGroup.Sequences.size() >= MinGroupSize;
+  }
+
+  virtual bool
+  passGroup(const CloneDetector::CloneGroup &PrototypeGroup) override {
+    return PrototypeGroup.Sequences.size() >= MinGroupSize;
+  }
+};
+
+/// \brief Filters all clones that don't have at least the given complexity
+///        value.
+class MinComplexityPass : public CloneDetector::Pass {
+  unsigned MinComplexity;
+
+public:
+  virtual ~MinComplexityPass() {}
+  MinComplexityPass(unsigned MinComplexity = 10)
+      : MinComplexity(MinComplexity) {}
+
+  virtual bool
+  passPrototype(const StmtSequence &Prototype,
+                const CloneDetector::CloneGroup &PrototypeGroup) override {
+    return PrototypeGroup.Signature.Complexity >= MinComplexity;
+  }
+};
+
+/// \brief Filters all presumed clones from a hash group that aren't actually
+///        clones but landed in the hash group due to unintended hash code
+///        collisions.
+class VerifyCloneHashesPass : public CloneDetector::Pass {
+public:
+  virtual ~VerifyCloneHashesPass() {}
+
+  virtual bool passCandidate(const StmtSequence &Prototype,
+                             const StmtSequence &Candidate) override;
+};
+
+/// \brief This pass removes all clone groups which are a full subset of another
+///        found clone group.
+class OnlyLargestClonePass : public CloneDetector::Pass {
+public:
+  virtual ~OnlyLargestClonePass() {}
+
+  virtual void
+  processResult(std::vector<CloneDetector::CloneGroup> &Result) override;
 };
 
 } // end namespace clang
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to