Author: rohini Date: Mon Oct 8 22:26:48 2018 New Revision: 1843210 URL: http://svn.apache.org/viewvc?rev=1843210&view=rev Log: PIG-5357: BagFactory interface should support creating a distinct bag from a set (jtolar via rohini)
Modified: pig/trunk/CHANGES.txt pig/trunk/src/org/apache/pig/data/BagFactory.java pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java pig/trunk/src/org/apache/pig/data/DistinctDataBag.java Modified: pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1843210&r1=1843209&r2=1843210&view=diff ============================================================================== --- pig/trunk/CHANGES.txt (original) +++ pig/trunk/CHANGES.txt Mon Oct 8 22:26:48 2018 @@ -26,6 +26,8 @@ PIG-5282: Upgade to Java 8 (satishsaley IMPROVEMENTS +PIG-5357: BagFactory interface should support creating a distinct bag from a set (jtolar via rohini) + PIG-5354: Show fieldname and a line number for casting errors (knoguchi) PIG-5342: Add setting to turn off bloom join combiner (satishsaley via rohini) Modified: pig/trunk/src/org/apache/pig/data/BagFactory.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/BagFactory.java?rev=1843210&r1=1843209&r2=1843210&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/BagFactory.java (original) +++ pig/trunk/src/org/apache/pig/data/BagFactory.java Mon Oct 8 22:26:48 2018 @@ -23,6 +23,7 @@ import java.net.URL; import java.net.URLClassLoader; import java.util.Comparator; import java.util.List; +import java.util.Set; import org.apache.pig.classification.InterfaceAudience; import org.apache.pig.classification.InterfaceStability; @@ -127,6 +128,21 @@ public abstract class BagFactory { public abstract DataBag newDistinctBag(); /** + * Get a distinct data bag. Distinct bags guarantee that when an + * iterator is opened on the bag, no two tuples returned from the + * iterator will be equal. + * @param tuples distinct set of tuples + * @return distinct data bag + */ + public DataBag newDistinctBag(Set<Tuple> tuples) { + DataBag bag = newDistinctBag(); + for (Tuple t : tuples) { + bag.add(t); + } + return bag; + } + + /** * Construct a new BagFactory */ protected BagFactory() { Modified: pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java?rev=1843210&r1=1843209&r2=1843210&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java (original) +++ pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java Mon Oct 8 22:26:48 2018 @@ -19,6 +19,7 @@ package org.apache.pig.data; import java.util.Comparator; import java.util.List; +import java.util.Set; /** * Default implementation of BagFactory. @@ -76,6 +77,21 @@ public class DefaultBagFactory extends B return b; } + /** + * Get a distinct data bag. + * @param tuples Distinct set of tuples used to initialize the bag. + * If null, an empty bag is returned. + */ + @Override + public DataBag newDistinctBag(Set<Tuple> tuples) { + if (tuples == null) { + return newDistinctBag(); + } + + DataBag b = new DistinctDataBag(tuples); + return b; + } + DefaultBagFactory() { super(); } Modified: pig/trunk/src/org/apache/pig/data/DistinctDataBag.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DistinctDataBag.java?rev=1843210&r1=1843209&r2=1843210&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/DistinctDataBag.java (original) +++ pig/trunk/src/org/apache/pig/data/DistinctDataBag.java Mon Oct 8 22:26:48 2018 @@ -32,6 +32,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.ListIterator; +import java.util.Set; import java.util.TreeSet; import org.apache.commons.logging.Log; @@ -63,6 +64,13 @@ public class DistinctDataBag extends Def mContents = new HashSet<Tuple>(); } + public DistinctDataBag(Set<Tuple> tuples) { + mContents = tuples; + + mSize = mContents.size(); + markSpillableIfNecessary(); + } + @Override public boolean isSorted() { return false; @@ -227,7 +235,7 @@ public class DistinctDataBag extends Def DistinctDataBagIterator() { // If this is the first read, we need to sort the data. synchronized (mContents) { - if (mContents instanceof HashSet) { + if (mContents instanceof Set) { preMerge(); // We're the first reader, we need to sort the data. // This is in case it gets dumped under us.