teemperor retitled this revision from "[analyzer] Added a pass architecture to 
the CloneDetector" to "[analyzer] Added a reusable constraint system to the 
CloneDetector".
teemperor updated the summary for this revision.
teemperor updated this revision to Diff 68680.
teemperor added a comment.

- Rebased patch.
- Renamed passes to constraints.


https://reviews.llvm.org/D23418

Files:
  include/clang/Analysis/CloneDetection.h
  lib/Analysis/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp

Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- lib/StaticAnalyzer/Checkers/CloneChecker.cpp
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -37,12 +37,13 @@
 
   /// \brief Reports all clones to the user.
   void reportClones(SourceManager &SM, AnalysisManager &Mgr,
-                    int MinComplexity) const;
+                    const std::vector<CloneDetector::CloneGroup> &Clones) const;
 
   /// \brief Reports only suspicious clones to the user along with informaton
   ///        that explain why they are suspicious.
-  void reportSuspiciousClones(SourceManager &SM, AnalysisManager &Mgr,
-                              int MinComplexity) const;
+  void reportSuspiciousClones(
+      SourceManager &SM, AnalysisManager &Mgr,
+      const std::vector<CloneDetector::CloneGroup> &Clones) const;
 };
 } // end anonymous namespace
 
@@ -59,6 +60,8 @@
   // At this point, every statement in the translation unit has been analyzed by
   // the CloneDetector. The only thing left to do is to report the found clones.
 
+  SourceManager &SM = BR.getSourceManager();
+
   int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
       "MinimumCloneComplexity", 10, this);
   assert(MinComplexity >= 0);
@@ -69,18 +72,50 @@
   bool ReportNormalClones = Mgr.getAnalyzerOptions().getBooleanOption(
       "ReportNormalClones", true, this);
 
-  if (ReportSuspiciousClones)
-    reportSuspiciousClones(BR.getSourceManager(), Mgr, MinComplexity);
+  // Let the CloneDetector create a list of clones from all the analyzed
+  // statements. We don't filter for matching variable patterns at this point
+  // because reportSuspiciousClones() wants to search them for errors.
+  std::vector<CloneDetector::CloneGroup> AllCloneGroups;
+
+  CloneDetector::ConstraintList GenericConstraints = {
+      // Remove all clones that are below the user-specified complexity value.
+      new MinComplexityConstraint(MinComplexity),
+      // We are ok with at least two clones in a group.
+      new MinGroupSizeConstraint(2),
+      // Remove all false-positive clones due to unintended collisions in our
+      // hash function.
+      new VerifyCloneHashesConstraint(),
+      // If one clone group contains another clone group, only report the bigger
+      // one. It's obvious to the user that if the child statements of a clone
+      // are also clones.
+      new OnlyLargestCloneConstraint()};
+
+  Detector.findClones(AllCloneGroups, GenericConstraints);
 
-  if (ReportNormalClones)
-    reportClones(BR.getSourceManager(), Mgr, MinComplexity);
-}
+  if (ReportSuspiciousClones)
+    reportSuspiciousClones(SM, Mgr, AllCloneGroups);
 
-void CloneChecker::reportClones(SourceManager &SM, AnalysisManager &Mgr,
-                                int MinComplexity) const {
+  // We are done for this translation unit unless we also need to report normal
+  // clones.
+  if (!ReportNormalClones)
+    return;
 
+  // Now that the suspicious clone detector has checked for pattern errors,
+  // we also filter all clones who don't have matching patterns
   std::vector<CloneDetector::CloneGroup> CloneGroups;
-  Detector.findClones(CloneGroups, MinComplexity);
+
+  CloneDetector::ConstraintList OnlyMatchingVariablePatterns = {
+      new MinGroupSizeConstraint(2), new MatchingVariablePatternConstraint()};
+
+  CloneDetector::constrainClones(AllCloneGroups, CloneGroups,
+                                 OnlyMatchingVariablePatterns);
+
+  reportClones(SM, Mgr, CloneGroups);
+}
+
+void CloneChecker::reportClones(
+    SourceManager &SM, AnalysisManager &Mgr,
+    const std::vector<CloneDetector::CloneGroup> &Clones) const {
 
   DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
 
@@ -90,7 +125,7 @@
   unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
                                                "Related code clone is here.");
 
-  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+  for (const CloneDetector::CloneGroup &Group : Clones) {
     // We group the clones by printing the first as a warning and all others
     // as a note.
     DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
@@ -100,12 +135,35 @@
   }
 }
 
-void CloneChecker::reportSuspiciousClones(SourceManager &SM,
-                                          AnalysisManager &Mgr,
-                                          int MinComplexity) const {
-
-  std::vector<CloneDetector::SuspiciousClonePair> Clones;
-  Detector.findSuspiciousClones(Clones, MinComplexity);
+void CloneChecker::reportSuspiciousClones(
+    SourceManager &SM, AnalysisManager &Mgr,
+    const std::vector<CloneDetector::CloneGroup> &Clones) const {
+
+  std::vector<VariablePattern::SuspiciousClonePair> Pairs;
+
+  for (const CloneDetector::CloneGroup &Group : Clones) {
+    for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
+      VariablePattern PatternA(Group.Sequences[i]);
+
+      for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
+        VariablePattern PatternB(Group.Sequences[j]);
+
+        VariablePattern::SuspiciousClonePair ClonePair;
+        // For now, we only report clones which break the variable pattern just
+        // once because multiple differences in a pattern are an indicator that
+        // those differences are maybe intended (e.g. because it's actually a
+        // different algorithm).
+        // TODO: In very big clones even multiple variables can be unintended,
+        // so replacing this number with a percentage could better handle such
+        // cases. On the other hand it could increase the false-positive rate
+        // for all clones if the percentage is too high.
+        if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) {
+          Pairs.push_back(ClonePair);
+          break;
+        }
+      }
+    }
+  }
 
   DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
 
@@ -122,7 +180,7 @@
                                "variable in a similar piece of code; did you "
                                "mean to use %0?");
 
-  for (CloneDetector::SuspiciousClonePair &Pair : Clones) {
+  for (VariablePattern::SuspiciousClonePair &Pair : Pairs) {
     // The first clone always has a suggestion and we report it to the user
     // along with the place where the suggestion should be used.
     DiagEngine.Report(Pair.FirstCloneInfo.VarRange.getBegin(),
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- lib/Analysis/CloneDetection.cpp
+++ lib/Analysis/CloneDetection.cpp
@@ -82,166 +82,6 @@
 
 SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
 
-namespace {
-
-/// \brief Analyzes the pattern of the referenced variables in a statement.
-class VariablePattern {
-
-  /// \brief Describes an occurence of a variable reference in a statement.
-  struct VariableOccurence {
-    /// The index of the associated VarDecl in the Variables vector.
-    size_t KindID;
-    /// The source range in the code where the variable was referenced.
-    SourceRange Range;
-
-    VariableOccurence(size_t KindID, SourceRange Range)
-        : KindID(KindID), Range(Range) {}
-  };
-
-  /// All occurences of referenced variables in the order of appearance.
-  std::vector<VariableOccurence> Occurences;
-  /// List of referenced variables in the order of appearance.
-  /// Every item in this list is unique.
-  std::vector<const VarDecl *> Variables;
-
-  /// \brief Adds a new variable referenced to this pattern.
-  /// \param VarDecl The declaration of the variable that is referenced.
-  /// \param Range The SourceRange where this variable is referenced.
-  void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) {
-    // First check if we already reference this variable
-    for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
-      if (Variables[KindIndex] == VarDecl) {
-        // If yes, add a new occurence that points to the existing entry in
-        // the Variables vector.
-        Occurences.emplace_back(KindIndex, Range);
-        return;
-      }
-    }
-    // If this variable wasn't already referenced, add it to the list of
-    // referenced variables and add a occurence that points to this new entry.
-    Occurences.emplace_back(Variables.size(), Range);
-    Variables.push_back(VarDecl);
-  }
-
-  /// \brief Adds each referenced variable from the given statement.
-  void addVariables(const Stmt *S) {
-    // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
-    // children). We skip such statements as they don't reference any
-    // variables.
-    if (!S)
-      return;
-
-    // Check if S is a reference to a variable. If yes, add it to the pattern.
-    if (auto D = dyn_cast<DeclRefExpr>(S)) {
-      if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
-        addVariableOccurence(VD, D->getSourceRange());
-    }
-
-    // Recursively check all children of the given statement.
-    for (const Stmt *Child : S->children()) {
-      addVariables(Child);
-    }
-  }
-
-public:
-  /// \brief Creates an VariablePattern object with information about the given
-  ///        StmtSequence.
-  VariablePattern(const StmtSequence &Sequence) {
-    for (const Stmt *S : Sequence)
-      addVariables(S);
-  }
-
-  /// \brief Counts the differences between this pattern and the given one.
-  /// \param Other The given VariablePattern to compare with.
-  /// \param FirstMismatch Output parameter that will be filled with information
-  ///        about the first difference between the two patterns. This parameter
-  ///        can be a nullptr, in which case it will be ignored.
-  /// \return Returns the number of differences between the pattern this object
-  ///         is following and the given VariablePattern.
-  ///
-  /// For example, the following statements all have the same pattern and this
-  /// function would return zero:
-  ///
-  ///   if (a < b) return a; return b;
-  ///   if (x < y) return x; return y;
-  ///   if (u2 < u1) return u2; return u1;
-  ///
-  /// But the following statement has a different pattern (note the changed
-  /// variables in the return statements) and would have two differences when
-  /// compared with one of the statements above.
-  ///
-  ///   if (a < b) return b; return a;
-  ///
-  /// This function should only be called if the related statements of the given
-  /// pattern and the statements of this objects are clones of each other.
-  unsigned countPatternDifferences(
-      const VariablePattern &Other,
-      CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) {
-    unsigned NumberOfDifferences = 0;
-
-    assert(Other.Occurences.size() == Occurences.size());
-    for (unsigned i = 0; i < Occurences.size(); ++i) {
-      auto ThisOccurence = Occurences[i];
-      auto OtherOccurence = Other.Occurences[i];
-      if (ThisOccurence.KindID == OtherOccurence.KindID)
-        continue;
-
-      ++NumberOfDifferences;
-
-      // If FirstMismatch is not a nullptr, we need to store information about
-      // the first difference between the two patterns.
-      if (FirstMismatch == nullptr)
-        continue;
-
-      // Only proceed if we just found the first difference as we only store
-      // information about the first difference.
-      if (NumberOfDifferences != 1)
-        continue;
-
-      const VarDecl *FirstSuggestion = nullptr;
-      // If there is a variable available in the list of referenced variables
-      // which wouldn't break the pattern if it is used in place of the
-      // current variable, we provide this variable as the suggested fix.
-      if (OtherOccurence.KindID < Variables.size())
-        FirstSuggestion = Variables[OtherOccurence.KindID];
-
-      // Store information about the first clone.
-      FirstMismatch->FirstCloneInfo =
-          CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
-              Variables[ThisOccurence.KindID], ThisOccurence.Range,
-              FirstSuggestion);
-
-      // Same as above but with the other clone. We do this for both clones as
-      // we don't know which clone is the one containing the unintended
-      // pattern error.
-      const VarDecl *SecondSuggestion = nullptr;
-      if (ThisOccurence.KindID < Other.Variables.size())
-        SecondSuggestion = Other.Variables[ThisOccurence.KindID];
-
-      // Store information about the second clone.
-      FirstMismatch->SecondCloneInfo =
-          CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
-              Variables[ThisOccurence.KindID], OtherOccurence.Range,
-              SecondSuggestion);
-
-      // SuspiciousClonePair guarantees that the first clone always has a
-      // suggested variable associated with it. As we know that one of the two
-      // clones in the pair always has suggestion, we swap the two clones
-      // in case the first clone has no suggested variable which means that
-      // the second clone has a suggested variable and should be first.
-      if (!FirstMismatch->FirstCloneInfo.Suggestion)
-        std::swap(FirstMismatch->FirstCloneInfo,
-                  FirstMismatch->SecondCloneInfo);
-
-      // This ensures that we always have at least one suggestion in a pair.
-      assert(FirstMismatch->FirstCloneInfo.Suggestion);
-    }
-
-    return NumberOfDifferences;
-  }
-};
-}
-
 /// \brief Prints the macro name that contains the given SourceLocation into
 ///        the given raw_string_ostream.
 static void printMacroName(llvm::raw_string_ostream &MacroStack,
@@ -584,6 +424,76 @@
   Sequences.push_back(std::make_pair(Signature, S));
 }
 
+void CloneDetector::constrainClones(
+    const std::vector<CloneGroup> &CloneGroups,
+    std::vector<CloneDetector::CloneGroup> &Result,
+    ConstraintList &Constraints) {
+  for (auto &HashGroup : CloneGroups) {
+    // Contains all indexes in HashGroup that were already added to a
+    // CloneGroup.
+    std::vector<char> Indexes;
+    Indexes.resize(HashGroup.Sequences.size());
+
+    for (unsigned i = 0; i < HashGroup.Sequences.size(); ++i) {
+      // Skip indexes that are already part of a CloneGroup.
+      if (Indexes[i])
+        continue;
+
+      // Pick the first unhandled StmtSequence and consider it as the beginning
+      // of a new CloneGroup for now.
+      // We don't add i to Indexes because we never iterate back.
+      StmtSequence Prototype = HashGroup.Sequences[i];
+      CloneDetector::CloneGroup PotentialGroup(Prototype, HashGroup.Signature);
+      ++Indexes[i];
+
+      if (!std::all_of(Constraints.begin(), Constraints.end(),
+                       [&Prototype, &PotentialGroup](Constraint *Constraint) {
+                         return Constraint->acceptsPrototype(Prototype,
+                                                             PotentialGroup);
+                       }))
+        continue;
+
+      // Check all following StmtSequences for clones.
+      for (unsigned j = i + 1; j < HashGroup.Sequences.size(); ++j) {
+        // Skip indexes that are already part of a CloneGroup.
+        if (Indexes[j])
+          continue;
+
+        // If a following StmtSequence belongs to our CloneGroup, we add it to
+        // it.
+        const StmtSequence &Candidate = HashGroup.Sequences[j];
+
+        if (!std::all_of(Constraints.begin(), Constraints.end(),
+                         [&Prototype, &Candidate](Constraint *Constraint) {
+                           return Constraint->acceptsCandidate(Prototype,
+                                                               Candidate);
+                         }))
+          continue;
+
+        PotentialGroup.Sequences.push_back(Candidate);
+        // Make sure we never visit this StmtSequence again.
+        ++Indexes[j];
+      }
+
+      if (!std::all_of(Constraints.begin(), Constraints.end(),
+                       [&PotentialGroup](Constraint *Constraint) {
+                         return Constraint->acceptsGroup(PotentialGroup);
+                       }))
+        continue;
+
+      // Otherwise, add it to the result and continue searching for more groups.
+      Result.push_back(PotentialGroup);
+    }
+
+    assert(std::all_of(Indexes.begin(), Indexes.end(),
+                       [](char c) { return c == 1; }));
+  }
+
+  for (Constraint *P : Constraints) {
+    P->constrainResult(Result);
+  }
+}
+
 namespace {
 /// \brief Returns true if and only if \p Stmt contains at least one other
 /// sequence in the \p Group.
@@ -614,6 +524,66 @@
 }
 } // end anonymous namespace
 
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               ConstraintList &Constraints) {
+  // Shortcut and necessary for the for-loop later in this function.
+  if (Sequences.empty())
+    return;
+
+  std::stable_sort(Sequences.begin(), Sequences.end(),
+                   [](std::pair<CloneSignature, StmtSequence> LHS,
+                      std::pair<CloneSignature, StmtSequence> RHS) {
+                     return LHS.first.Hash < RHS.first.Hash;
+                   });
+
+  std::vector<CloneGroup> CloneGroups;
+
+  // Check for each CloneSignature if its successor has the same hash value.
+  // We don't check the last CloneSignature as it has no successor.
+  // Note: The 'size - 1 ' in the condition is safe because we check for an
+  // empty Sequences vector at the beginning of this function.
+  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
+    const auto Current = Sequences[i];
+    const auto Next = Sequences[i + 1];
+
+    if (Current.first.Hash != Next.first.Hash)
+      continue;
+
+    // It's likely that we just found an sequence of CloneSignatures that
+    // represent a CloneGroup, so we create a new group and start checking and
+    // adding the CloneSignatures in this sequence.
+    CloneGroup Group;
+    Group.Signature = Current.first;
+
+    for (; i < Sequences.size(); ++i) {
+      const auto &Signature = Sequences[i];
+
+      // A different hash value means we have reached the end of the sequence.
+      if (Current.first.Hash != Signature.first.Hash) {
+        // The current Signature could be the start of a new CloneGroup. So we
+        // decrement i so that we visit it again in the outer loop.
+        // Note: i can never be 0 at this point because we are just comparing
+        // the hash of the Current CloneSignature with itself in the 'if' above.
+        assert(i != 0);
+        --i;
+        break;
+      }
+
+      Group.Sequences.push_back(Signature.second);
+    }
+
+    if (!std::all_of(Constraints.begin(), Constraints.end(),
+                     [&Group](Constraint *Constraint) {
+                       return Constraint->acceptsHashGroup(Group);
+                     }))
+      continue;
+
+    CloneGroups.push_back(Group);
+  }
+
+  constrainClones(CloneGroups, Result, Constraints);
+}
+
 namespace {
 /// \brief Wrapper around FoldingSetNodeID that it can be used as the template
 ///        argument of the StmtDataCollector.
@@ -663,145 +633,129 @@
   return DataLHS == DataRHS;
 }
 
-/// \brief Finds all actual clone groups in a single group of presumed clones.
-/// \param Result Output parameter to which all found groups are added.
-/// \param Group A group of presumed clones. The clones are allowed to have a
-///              different variable pattern and may not be actual clones of each
-///              other.
-/// \param CheckVariablePatterns If true, every clone in a group that was added
-///              to the output follows the same variable pattern as the other
-///              clones in its group.
-static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
-                              const CloneDetector::CloneGroup &Group,
-                              bool CheckVariablePattern) {
-  // We remove the Sequences one by one, so a list is more appropriate.
-  std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
-                                              Group.Sequences.end());
-
-  // Search for clones as long as there could be clones in UnassignedSequences.
-  while (UnassignedSequences.size() > 1) {
-
-    // Pick the first Sequence as a protoype for a new clone group.
-    StmtSequence Prototype = UnassignedSequences.front();
-    UnassignedSequences.pop_front();
-
-    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature);
-
-    // Analyze the variable pattern of the prototype. Every other StmtSequence
-    // needs to have the same pattern to get into the new clone group.
-    VariablePattern PrototypeFeatures(Prototype);
-
-    // Search all remaining StmtSequences for an identical variable pattern
-    // and assign them to our new clone group.
-    auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
-    while (I != E) {
-      // If the sequence doesn't fit to the prototype, we have encountered
-      // an unintended hash code collision and we skip it.
-      if (!areSequencesClones(Prototype, *I)) {
-        ++I;
-        continue;
-      }
-
-      // If we weren't asked to check for a matching variable pattern in clone
-      // groups we can add the sequence now to the new clone group.
-      // If we were asked to check for matching variable pattern, we first have
-      // to check that there are no differences between the two patterns and
-      // only proceed if they match.
-      if (!CheckVariablePattern ||
-          VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
-        FilteredGroup.Sequences.push_back(*I);
-        I = UnassignedSequences.erase(I);
-        continue;
-      }
-
-      // We didn't found a matching variable pattern, so we continue with the
-      // next sequence.
-      ++I;
+void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,
+                                           SourceRange Location) {
+  // First check if we already reference this variable
+  for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
+    if (Variables[KindIndex] == VarDecl) {
+      // If yes, add a new occurence that points to the existing entry in
+      // the Variables vector.
+      Occurences.emplace_back(KindIndex, Location);
+      return;
     }
-
-    // Add a valid clone group to the list of found clone groups.
-    if (!FilteredGroup.isValid())
-      continue;
-
-    Result.push_back(FilteredGroup);
   }
+  // If this variable wasn't already referenced, add it to the list of
+  // referenced variables and add a occurence that points to this new entry.
+  Occurences.emplace_back(Variables.size(), Location);
+  Variables.push_back(VarDecl);
 }
 
-void CloneDetector::findClones(std::vector<CloneGroup> &Result,
-                               unsigned MinGroupComplexity,
-                               bool CheckPatterns) {
-  // A shortcut (and necessary for the for-loop later in this function).
-  if (Sequences.empty())
+void VariablePattern::addVariables(const Stmt *S) {
+  // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
+  // children). We skip such statements as they don't reference any
+  // variables.
+  if (!S)
     return;
 
-  // We need to search for groups of StmtSequences with the same hash code to
-  // create our initial clone groups. By sorting all known StmtSequences by
-  // their hash value we make sure that StmtSequences with the same hash code
-  // are grouped together in the Sequences vector.
-  // Note: We stable sort here because the StmtSequences are added in the order
-  // in which they appear in the source file. We want to preserve that order
-  // because we also want to report them in that order in the CloneChecker.
-  std::stable_sort(Sequences.begin(), Sequences.end(),
-                   [](std::pair<CloneSignature, StmtSequence> LHS,
-                      std::pair<CloneSignature, StmtSequence> RHS) {
-                     return LHS.first.Hash < RHS.first.Hash;
-                   });
+  // Check if S is a reference to a variable. If yes, add it to the pattern.
+  if (auto D = dyn_cast<DeclRefExpr>(S)) {
+    if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
+      addVariableOccurence(VD, D->getSourceRange());
+  }
+  // Recursively check all children of the given statement.
+  for (const Stmt *Child : S->children()) {
+    addVariables(Child);
+  }
+}
 
-  std::vector<CloneGroup> CloneGroups;
+unsigned
+VariablePattern::getPatternDifferences(const VariablePattern &Other,
+                                       SuspiciousClonePair *FirstMismatch) {
+  unsigned NumberOfDifferences = 0;
 
-  // Check for each CloneSignature if its successor has the same hash value.
-  // We don't check the last CloneSignature as it has no successor.
-  // Note: The 'size - 1' in the condition is safe because we check for an empty
-  // Sequences vector at the beginning of this function.
-  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
-    const auto Current = Sequences[i];
-    const auto Next = Sequences[i + 1];
+  assert(Other.Occurences.size() == Occurences.size());
+  for (unsigned i = 0; i < Occurences.size(); ++i) {
+    auto ThisOccurence = Occurences[i];
+    auto OtherOccurence = Other.Occurences[i];
+    if (ThisOccurence.KindID != OtherOccurence.KindID) {
+      ++NumberOfDifferences;
 
-    if (Current.first.Hash != Next.first.Hash)
-      continue;
+      // If FirstMismatch is not a nullptr, we need to store information about
+      // the first difference between the two patterns.
+      if (FirstMismatch == nullptr)
+        continue;
+      const VarDecl *FirstSuggestion = nullptr;
+      // If there is a variable available in the list of referenced variables
+      // which wouldn't break the pattern if it is used in place of the
+      // current variable, we provide this variable as the suggested fix.
+      if (OtherOccurence.KindID < Variables.size())
+        FirstSuggestion = Variables[OtherOccurence.KindID];
 
-    // It's likely that we just found an sequence of CloneSignatures that
-    // represent a CloneGroup, so we create a new group and start checking and
-    // adding the CloneSignatures in this sequence.
-    CloneGroup Group;
-    Group.Signature = Current.first;
+      // Store information about the first clone.
+      FirstMismatch->FirstCloneInfo = SuspiciousClonePair::SuspiciousCloneInfo(
+          Variables[ThisOccurence.KindID], ThisOccurence.Location,
+          FirstSuggestion);
 
-    for (; i < Sequences.size(); ++i) {
-      const auto &Signature = Sequences[i];
+      // Same as above but with the other clone. We do this for both clones as
+      // we don't know which clone is the one containing the unintended
+      // pattern error.
+      const VarDecl *SecondSuggestion = nullptr;
+      if (ThisOccurence.KindID < Other.Variables.size())
+        SecondSuggestion = Other.Variables[ThisOccurence.KindID];
 
-      // A different hash value means we have reached the end of the sequence.
-      if (Current.first.Hash != Signature.first.Hash) {
-        // The current Signature could be the start of a new CloneGroup. So we
-        // decrement i so that we visit it again in the outer loop.
-        // Note: i can never be 0 at this point because we are just comparing
-        // the hash of the Current CloneSignature with itself in the 'if' above.
-        assert(i != 0);
-        --i;
-        break;
-      }
+      // Store information about the second clone.
+      FirstMismatch->SecondCloneInfo = SuspiciousClonePair::SuspiciousCloneInfo(
+          Variables[ThisOccurence.KindID], OtherOccurence.Location,
+          SecondSuggestion);
 
-      // Skip CloneSignatures that won't pass the complexity requirement.
-      if (Signature.first.Complexity < MinGroupComplexity)
-        continue;
+      // SuspiciousClonePair guarantees that the first clone always has a
+      // suggested variable associated with it. As we know that one of the two
+      // clones in the pair always has suggestion, we swap the two clones
+      // in case the first clone has no suggested variable which means that
+      // the second clone has a suggested variable and should be first.
+      if (!FirstMismatch->FirstCloneInfo.Suggestion)
+        std::swap(FirstMismatch->FirstCloneInfo,
+                  FirstMismatch->SecondCloneInfo);
 
-      Group.Sequences.push_back(Signature.second);
+      // This ensures that we always have at least one suggestion in a pair.
+      assert(FirstMismatch->FirstCloneInfo.Suggestion);
     }
+  }
 
-    // There is a chance that we haven't found more than two fitting
-    // CloneSignature because not enough CloneSignatures passed the complexity
-    // requirement. As a CloneGroup with less than two members makes no sense,
-    // we ignore this CloneGroup and won't add it to the result.
-    if (!Group.isValid())
-      continue;
+  return NumberOfDifferences;
+}
 
-    CloneGroups.push_back(Group);
-  }
+bool CloneDetector::Constraint::acceptsHashGroup(
+    const CloneDetector::CloneGroup &HashGroup) {
+  return true;
+}
 
-  // Add every valid clone group that fulfills the complexity requirement.
-  for (const CloneGroup &Group : CloneGroups) {
-    createCloneGroups(Result, Group, CheckPatterns);
-  }
+bool CloneDetector::Constraint::acceptsPrototype(
+    const StmtSequence &Prototype,
+    const CloneDetector::CloneGroup &PrototypeGroup) {
+  return true;
+}
 
+bool CloneDetector::Constraint::acceptsCandidate(
+    const StmtSequence &Prototype, const StmtSequence &Candidate) {
+  return true;
+}
+
+bool CloneDetector::Constraint::acceptsGroup(
+    const CloneDetector::CloneGroup &Group) {
+  return true;
+}
+
+void CloneDetector::Constraint::constrainResult(
+    std::vector<CloneDetector::CloneGroup> &Result) {}
+
+bool VerifyCloneHashesConstraint::acceptsCandidate(
+    const StmtSequence &Prototype, const StmtSequence &Candidate) {
+  return areSequencesClones(Prototype, Candidate);
+}
+
+void OnlyLargestCloneConstraint::constrainResult(
+    std::vector<CloneDetector::CloneGroup> &Result) {
   std::vector<unsigned> IndexesToRemove;
 
   // Compare every group in the result with the rest. If one groups contains
@@ -828,37 +782,3 @@
     Result.erase(Result.begin() + *I);
   }
 }
-
-void CloneDetector::findSuspiciousClones(
-    std::vector<CloneDetector::SuspiciousClonePair> &Result,
-    unsigned MinGroupComplexity) {
-  std::vector<CloneGroup> Clones;
-  // Reuse the normal search for clones but specify that the clone groups don't
-  // need to have a common referenced variable pattern so that we can manually
-  // search for the kind of pattern errors this function is supposed to find.
-  findClones(Clones, MinGroupComplexity, false);
-
-  for (const CloneGroup &Group : Clones) {
-    for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
-      VariablePattern PatternA(Group.Sequences[i]);
-
-      for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
-        VariablePattern PatternB(Group.Sequences[j]);
-
-        CloneDetector::SuspiciousClonePair ClonePair;
-        // For now, we only report clones which break the variable pattern just
-        // once because multiple differences in a pattern are an indicator that
-        // those differences are maybe intended (e.g. because it's actually
-        // a different algorithm).
-        // TODO: In very big clones even multiple variables can be unintended,
-        // so replacing this number with a percentage could better handle such
-        // cases. On the other hand it could increase the false-positive rate
-        // for all clones if the percentage is too high.
-        if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) {
-          Result.push_back(ClonePair);
-          break;
-        }
-      }
-    }
-  }
-}
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- include/clang/Analysis/CloneDetection.h
+++ include/clang/Analysis/CloneDetection.h
@@ -17,7 +17,6 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/StringMap.h"
 
 #include <vector>
 
@@ -158,6 +157,7 @@
 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
 /// are not supported.
 class CloneDetector {
+
 public:
   typedef unsigned DataPiece;
 
@@ -201,8 +201,7 @@
   };
 
   /// Holds group of StmtSequences that are clones of each other and the
-  /// complexity value (see CloneSignature::Complexity) that all stored
-  /// StmtSequences have in common.
+  /// CloneSignature that all clones have in common.
   struct CloneGroup {
     std::vector<StmtSequence> Sequences;
     CloneSignature Signature;
@@ -213,13 +212,6 @@
         : Signature(Signature) {
       Sequences.push_back(Seq);
     }
-
-    /// \brief Returns false if and only if this group should be skipped when
-    ///        searching for clones.
-    bool isValid() const {
-      // A clone group with only one member makes no sense, so we skip them.
-      return Sequences.size() > 1;
-    }
   };
 
   /// \brief Generates and stores search data for all statements in the body of
@@ -229,17 +221,140 @@
   /// \brief Stores the CloneSignature to allow future querying.
   void add(const StmtSequence &S, const CloneSignature &Signature);
 
-  /// \brief Searches the provided statements for clones.
+  /// \brief Represents a way to filter clones from a list of clone groups.
   ///
-  /// \param Result Output parameter that is filled with a list of found
-  ///               clone groups. Each group contains multiple StmtSequences
-  ///               that were identified to be clones of each other.
-  /// \param MinGroupComplexity Only return clones which have at least this
-  ///                           complexity value.
-  /// \param CheckPatterns Returns only clone groups in which the referenced
-  ///                      variables follow the same pattern.
-  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
-                  bool CheckPatterns = true);
+  /// A reoccuring task in the CloneDetector is filtering out clones that don't
+  /// have certain properties. This filtering is done by reusable constraints
+  /// of which each removes clones with a undesireable property.
+  /// Usually, constraints are grouped together in a ConstraintList and are
+  /// applied one after another by the CloneDetector until all undesireable
+  /// clones are removed.
+  ///
+  /// To create a new constraint, inherit from this class and overwrite the
+  /// respective virtual methods.
+  class Constraint {
+  public:
+    virtual ~Constraint() {}
+
+    /// \brief Allows skipping the processing of a given hash group.
+    /// \param HashGroup The given hash group.
+    /// \return Return true if the CloneDetector should start creating actual
+    ///         clone groups from the given hash group.
+    virtual bool acceptsHashGroup(const CloneGroup &HashGroup);
+
+    /// \brief Allows skipping the creation of clone groups from a given
+    ///        prototype StmtSequence.
+    /// \param Prototype The given prototype StmtSequence.
+    /// \param PrototypeGroup The clone group to which the Prototype belongs.
+    /// \return Return true if the CloneDetector should search for candidates
+    ///         that will fit into the given prototype group.
+    virtual bool acceptsPrototype(const StmtSequence &Prototype,
+                                  const CloneGroup &PrototypeGroup);
+
+    /// \brief Decides if a StmtSequence belongs into a certain clone group.
+    /// \param Prototype The prototype that
+    /// \param Candidate
+    /// \return Return true if the candidate fits to the prototype.
+    virtual bool acceptsCandidate(const StmtSequence &Prototype,
+                                  const StmtSequence &Candidate);
+
+    /// \brief Decides if a given clone group should be added to the result.
+    /// \param Group The given clone group.
+    /// \return Return true if the given group should be included in the result.
+    ///
+    /// This function should be used to enforce context-sensitive constraints
+    /// for clones (i.e. where the constraint needs to access other clones in
+    /// the current clone group).
+    virtual bool acceptsGroup(const CloneGroup &Group);
+
+    /// \brief Can process and modify the clone groups before they are returned.
+    /// \param Result The list of clone groups.
+    ///
+    /// This function should be used to enforce context-sensitive constraints
+    /// for clone groups (i.e. where the constraint needs to access other clone
+    /// groups).
+    virtual void constrainResult(std::vector<CloneGroup> &Result);
+  };
+
+  /// \brief Stores a list of constraints.
+  ///
+  /// This class also takes over the memory ownership of all given constraints.
+  class ConstraintList {
+    std::vector<Constraint *> Constraints;
+
+  public:
+    ConstraintList(std::initializer_list<Constraint *> IL)
+        : Constraints(IL.begin(), IL.end()) {}
+    ~ConstraintList() {
+      for (Constraint *P : Constraints) {
+        delete P;
+      }
+    }
+
+    // Non-copyable because we free all passes in the destructor.
+    ConstraintList(const ConstraintList &other) = delete;
+    ConstraintList &operator=(const ConstraintList &) = delete;
+
+    std::vector<Constraint *>::iterator begin() { return Constraints.begin(); }
+    std::vector<Constraint *>::iterator end() { return Constraints.end(); }
+  };
+
+  /// \brief Generates hash groups and applies the given passes to them.
+  /// \param Result Output parameter to which all created clone groups are
+  ///               added.
+  /// \param Passes The passes that should be used to filter the hash groups.
+  void findClones(std::vector<CloneGroup> &Result, ConstraintList &Passes);
+
+  /// \brief Applies the given passes on a list of clone groups.
+  /// \param CloneGroups The list of clone groups.
+  /// \param Result Output parameter to which all created clone groups are
+  ///               added.
+  /// \param Passes The passes that should be applied.
+  static void constrainClones(const std::vector<CloneGroup> &CloneGroups,
+                              std::vector<CloneGroup> &Result,
+                              ConstraintList &Passes);
+
+private:
+  /// Stores all encountered StmtSequences alongside their CloneSignature.
+  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
+};
+
+/// \brief Analyzes the pattern of the referenced variables in a statement.
+class VariablePattern {
+
+  /// \brief Describes an occurence of a variable reference in a statement.
+  struct VariableOccurence {
+    /// The index of the associated VarDecl in the Variables vector.
+    size_t KindID;
+    /// The location in the code where the variable was references.
+    SourceRange Location;
+
+    VariableOccurence(size_t KindID, SourceRange Location)
+        : KindID(KindID), Location(Location) {}
+  };
+
+  /// All occurences of referenced variables in the order of appearance.
+  std::vector<VariableOccurence> Occurences;
+  /// List of referenced variables in the order of appearance.
+  /// Every item in this list is unique.
+  std::vector<const VarDecl *> Variables;
+
+  /// \brief Adds a new variable referenced to this pattern.
+  /// \param VarDecl The declaration of the variable that is referenced.
+  /// \param Location The SourceRange where this variable is referenced.
+  void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location);
+
+  /// \brief Adds each referenced variable from the given statement.
+  void addVariables(const Stmt *S);
+
+public:
+  /// \brief Creates an VariablePattern object with information about the given
+  ///        StmtSequence.
+  VariablePattern(const StmtSequence &Sequence) {
+    for (const Stmt *S : Sequence)
+      addVariables(S);
+  }
+  VariablePattern() {}
 
   /// \brief Describes two clones that reference their variables in a different
   ///        pattern which could indicate a programming error.
@@ -266,17 +381,111 @@
     SuspiciousCloneInfo SecondCloneInfo;
   };
 
-  /// \brief Searches the provided statements for pairs of clones that don't
-  ///        follow the same pattern when referencing variables.
-  /// \param Result Output parameter that will contain the clone pairs.
-  /// \param MinGroupComplexity Only clone pairs in which the clones have at
-  ///                           least this complexity value.
-  void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
-                            unsigned MinGroupComplexity);
+  /// \brief Compares this pattern with the given one.
+  /// \param Other The given VariablePattern to compare with.
+  /// \param FirstMismatch Output parameter that will be filled with information
+  ///        about the first difference between the two patterns. This parameter
+  ///        can be a nullptr, in which case it will be ignored.
+  /// \return Returns the number of differences between the pattern this object
+  ///         is following and the given VariablePattern.
+  ///
+  /// For example, the following statements all have the same pattern and this
+  /// function would return zero:
+  ///
+  ///   if (a < b) return a; return b;
+  ///   if (x < y) return x; return y;
+  ///   if (u2 < u1) return u2; return u1;
+  ///
+  /// But the following statement has a different pattern (note the changed
+  /// variables in the return statements) and would have two differences when
+  /// compared with one of the statements above.
+  ///
+  ///   if (a < b) return b; return a;
+  ///
+  /// This function should only be called if the related statements of the given
+  /// pattern and the statements of this objects are clones of each other.
+  unsigned getPatternDifferences(const VariablePattern &Other,
+                                 SuspiciousClonePair *FirstMismatch = nullptr);
+};
+
+/// \brief Filters out all clones that don't reference variables in the same
+///        pattern.
+class MatchingVariablePatternConstraint : public CloneDetector::Constraint {
+  VariablePattern PrototypePattern;
 
-private:
-  /// Stores all encountered StmtSequences alongside their CloneSignature.
-  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
+public:
+  virtual ~MatchingVariablePatternConstraint() {}
+
+  virtual bool
+  acceptsPrototype(const StmtSequence &Prototype,
+                   const CloneDetector::CloneGroup &PrototypeGroup) override {
+    PrototypePattern = VariablePattern(Prototype);
+    return true;
+  }
+
+  virtual bool acceptsCandidate(const StmtSequence &Prototype,
+                                const StmtSequence &Candidate) override {
+    VariablePattern CandidatePattern(Candidate);
+    return PrototypePattern.getPatternDifferences(CandidatePattern) == 0;
+  }
+};
+
+/// \brief Filters all clone groups that don't have at least the given size.
+class MinGroupSizeConstraint : public CloneDetector::Constraint {
+  unsigned MinGroupSize;
+
+public:
+  virtual ~MinGroupSizeConstraint() {}
+  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
+      : MinGroupSize(MinGroupSize) {}
+
+  virtual bool
+  acceptsHashGroup(const CloneDetector::CloneGroup &HashGroup) override {
+    return HashGroup.Sequences.size() >= MinGroupSize;
+  }
+
+  virtual bool
+  acceptsGroup(const CloneDetector::CloneGroup &PrototypeGroup) override {
+    return PrototypeGroup.Sequences.size() >= MinGroupSize;
+  }
+};
+
+/// \brief Filters all clones that don't have at least the given complexity
+///        value.
+class MinComplexityConstraint : public CloneDetector::Constraint {
+  unsigned MinComplexity;
+
+public:
+  virtual ~MinComplexityConstraint() {}
+  MinComplexityConstraint(unsigned MinComplexity = 10)
+      : MinComplexity(MinComplexity) {}
+
+  virtual bool
+  acceptsPrototype(const StmtSequence &Prototype,
+                   const CloneDetector::CloneGroup &PrototypeGroup) override {
+    return PrototypeGroup.Signature.Complexity >= MinComplexity;
+  }
+};
+
+/// \brief Filters all presumed clones from a hash group that aren't actually
+///        clones but landed in the hash group due to unintended hash code
+///        collisions.
+class VerifyCloneHashesConstraint : public CloneDetector::Constraint {
+public:
+  virtual ~VerifyCloneHashesConstraint() {}
+
+  virtual bool acceptsCandidate(const StmtSequence &Prototype,
+                                const StmtSequence &Candidate) override;
+};
+
+/// \brief This constraint removes all clone groups which are a full subset of
+///        another found clone group.
+class OnlyLargestCloneConstraint : public CloneDetector::Constraint {
+public:
+  virtual ~OnlyLargestCloneConstraint() {}
+
+  virtual void
+  constrainResult(std::vector<CloneDetector::CloneGroup> &Result) override;
 };
 
 } // end namespace clang
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to