I am trying to write a GenericUDF that collects the values of a specific struct
field from every element of an array, for each record, and returns them as an
array as well. I wrote the UDF (below), and it seems to work, but:
1) It does not work when I run it against an external table, although it works
fine on a managed table. Any idea why?
2) I am having a tough time writing a test for this. I have attached the test I
have so far, but it does not work: I always get either 'java.util.ArrayList
cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector'
or 'cannot cast String to LazyString'. My question is: how do I supply a list of
structs to the evaluate method?
Any help will be greatly appreciated.
Thanks, Peter
The table:
-- External table partitioned by ds; the products column is an array of
-- single-field structs (productCategory), and rows are parsed by the SerDe
-- named in ROW FORMAT SERDE.
CREATE EXTERNAL TABLE FOO ( TS string, customerId string, products
array< struct<productCategory:string> > ) PARTITIONED BY (ds string) ROW
FORMAT SERDE 'some.serde' WITH SERDEPROPERTIES ('error.ignore'='true')
LOCATION 'some_locations' ;
A sample row holds: 1340321132000, 'some_company',
[{"productCategory":"footwear"},{"productCategory":"eyewear"}]
This is my code:
import org.apache.hadoop.hive.ql.exec.Description;import
org.apache.hadoop.hive.ql.exec.UDFArgumentException;import
org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import
org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import
org.apache.hadoop.hive.ql.metadata.HiveException;import
org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import
org.apache.hadoop.hive.serde2.lazy.LazyString;import
org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.StructField;import
org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;import
org.apache.hadoop.io.Text;
import java.util.ArrayList;
@Description(name = "extract_product_category", value = "_FUNC_( array<
struct<sha256:string> > ) - Collect all product category field values inside an
array of struct(s), and return the results in an array<string>",
extended = "Example:\n SELECT
_FUNC_(array_of_structs_with_product_category_field)")public class
GenericUDFExtractProductCategory extends GenericUDF{ private
ArrayList ret;
private ListObjectInspector listOI; private StructObjectInspector
structOI; private ObjectInspector prodCatOI;
@Override public ObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException { if (args.length != 1) {
throw new UDFArgumentLengthException("The function extract_product_category()
requires exactly one argument."); }
if (args[0].getCategory() != Category.LIST) { throw new
UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument
for extract_product_category but " + args[0].getTypeName() + " is found
instead"); }
listOI = ((ListObjectInspector) args[0]); structOI =
((StructObjectInspector) listOI.getListElementObjectInspector());
if (structOI.getAllStructFieldRefs().size() != 1) { throw
new UDFArgumentTypeException(0, "Incorrect number of fields in the struct,
should be one"); }
StructField productCategoryField =
structOI.getStructFieldRef("productCategory"); //If not, throw exception
if (productCategoryField == null) { throw new
UDFArgumentTypeException(0, "NO \"productCategory\" field in input structure");
}
//Are they of the correct types? //We store these object
inspectors for use in the evaluate() method prodCatOI =
productCategoryField.getFieldObjectInspector();
//First are they primitives if (prodCatOI.getCategory() !=
Category.PRIMITIVE) { throw new UDFArgumentTypeException(0,
"productCategory field must be of string type"); }
//Are they of the correct primitives? if
(((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() !=
PrimitiveObjectInspector.PrimitiveCategory.STRING) { throw new
UDFArgumentTypeException(0, "productCategory field must be of string type");
}
ret = new ArrayList();
return
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
@Override public ArrayList evaluate(DeferredObject[] arguments)
throws HiveException { ret.clear();
if (arguments.length != 1) { return null; }
if (arguments[0].get() == null) { return null; }
int numElements = listOI.getListLength(arguments[0].get());
for (int i = 0; i < numElements; i++) { LazyString
prodCatDataObject = (LazyString)
(structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i),
structOI.getStructFieldRef("productCategory"))); Text
productCategoryValue = ((StringObjectInspector)
prodCatOI).getPrimitiveWritableObject(prodCatDataObject);
ret.add(productCategoryValue); } return ret; }
@Override public String getDisplayString(String[] strings) {
assert (strings.length > 0); StringBuilder sb = new StringBuilder();
sb.append("extract_product_category("); sb.append(strings[0]);
sb.append(")"); return sb.toString(); }}
My Test:
import org.apache.hadoop.hive.ql.metadata.HiveException;import
org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import
org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;import
org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import
org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import
org.testng.annotations.Test;
import java.util.ArrayList;import java.util.List;
public class TestGenericUDFExtractShas{ ArrayList<String> fieldNames = new
ArrayList<String>(); ArrayList<ObjectInspector> fieldObjectInspectors = new
ArrayList<ObjectInspector>();
@Test public void simpleTest() throws Exception {
ListObjectInspector firstInspector = new MyListObjectInspector();
ArrayList test = new ArrayList(); test.add("test");
ArrayList test2 = new ArrayList(); test2.add(test);
StructObjectInspector soi =
ObjectInspectorFactory.getStandardStructObjectInspector(test, test2);
fieldNames.add("productCategory");
fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
GenericUDF.DeferredObject firstDeferredObject = new
MyDeferredObject(test2);
GenericUDF extract_shas = new GenericUDFExtractShas();
extract_shas.initialize(new ObjectInspector[]{firstInspector});
extract_shas.evaluate(new DeferredObject[]{firstDeferredObject}); }
public class MyDeferredObject implements DeferredObject { private
Object value;
public MyDeferredObject(Object value) { this.value = value;
}
@Override public Object get() throws HiveException {
return value; } }
private class MyListObjectInspector implements ListObjectInspector {
@Override public ObjectInspector getListElementObjectInspector()
{ return
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldObjectInspectors); }
@Override public Object getListElement(Object data, int index)
{ List myList = (List) data; if (myList == null ||
index > myList.size()) { return null; }
return myList.get(index); }
@Override public int getListLength(Object data) {
if (data == null) { return -1; } return
((List) data).size(); }
@Override public List<?> getList(Object data) {
return (List) data; }
@Override public String getTypeName() { return
null; //To change body of implemented methods use File | Settings | File
Templates. }
@Override public Category getCategory() {
return Category.LIST; } }}