I am trying to write a GenericUDF that collects a specific struct field from every element of an array column in each record and returns those values as an array. I wrote the UDF (below), and it seems to work, but:

1) It does not work when I run it against an external table, although it works fine on a managed table. Any idea why?

2) I am having a tough time writing a test for it. I have attached the test I have so far; it does not work, and I always get 'java.util.ArrayList cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector' or 'cannot cast String to LazyString'. My question is: how do I supply a list of structs to the evaluate() method? (An untested sketch of what I think this might look like is at the bottom of this mail.)

Any help will be greatly appreciated.

Thanks,
Peter
The table:

CREATE EXTERNAL TABLE FOO (
    TS string,
    customerId string,
    products array< struct<productCategory:string> >
)
PARTITIONED BY (ds string)
ROW FORMAT SERDE 'some.serde'
WITH SERDEPROPERTIES ('error.ignore'='true')
LOCATION 'some_locations';
A sample row holds:

1340321132000, 'some_company', [{"productCategory":"footwear"},{"productCategory":"eyewear"}]
This is my code:
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;

import java.util.ArrayList;

@Description(name = "extract_product_category",
        value = "_FUNC_( array< struct<productCategory:string> > ) - Collect all product category field values inside an array of struct(s), and return the results in an array<string>",
        extended = "Example:\n SELECT _FUNC_(array_of_structs_with_product_category_field)")
public class GenericUDFExtractProductCategory
        extends GenericUDF
{
    private ArrayList ret;

    private ListObjectInspector listOI;
    private StructObjectInspector structOI;
    private ObjectInspector prodCatOI;

    @Override
    public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException
    {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("The function extract_product_category() requires exactly one argument.");
        }

        if (args[0].getCategory() != Category.LIST) {
            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_product_category but " + args[0].getTypeName() + " is found instead");
        }

        listOI = ((ListObjectInspector) args[0]);
        structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());

        if (structOI.getAllStructFieldRefs().size() != 1) {
            throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");
        }

        StructField productCategoryField = structOI.getStructFieldRef("productCategory");
        // If the field is missing, throw an exception
        if (productCategoryField == null) {
            throw new UDFArgumentTypeException(0, "No \"productCategory\" field in input structure");
        }

        // Are they of the correct types?
        // We store these object inspectors for use in the evaluate() method
        prodCatOI = productCategoryField.getFieldObjectInspector();

        // First, is the field a primitive?
        if (prodCatOI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");
        }

        // Is it the correct primitive?
        if (((PrimitiveObjectInspector) prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");
        }

        ret = new ArrayList();

        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    @Override
    public ArrayList evaluate(DeferredObject[] arguments) throws HiveException
    {
        ret.clear();

        if (arguments.length != 1) {
            return null;
        }

        if (arguments[0].get() == null) {
            return null;
        }

        int numElements = listOI.getListLength(arguments[0].get());

        for (int i = 0; i < numElements; i++) {
            LazyString prodCatDataObject = (LazyString) (structOI.getStructFieldData(
                    listOI.getListElement(arguments[0].get(), i),
                    structOI.getStructFieldRef("productCategory")));
            Text productCategoryValue = ((StringObjectInspector) prodCatOI).getPrimitiveWritableObject(prodCatDataObject);
            ret.add(productCategoryValue);
        }
        return ret;
    }

    @Override
    public String getDisplayString(String[] strings)
    {
        assert (strings.length > 0);
        StringBuilder sb = new StringBuilder();
        sb.append("extract_product_category(");
        sb.append(strings[0]);
        sb.append(")");
        return sb.toString();
    }
}

My Test:
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.List;

public class TestGenericUDFExtractProductCategory {
    ArrayList<String> fieldNames = new ArrayList<String>();
    ArrayList<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>();

    @Test
    public void simpleTest() throws Exception {
        ListObjectInspector firstInspector = new MyListObjectInspector();

        ArrayList test = new ArrayList();
        test.add("test");

        ArrayList test2 = new ArrayList();
        test2.add(test);

        StructObjectInspector soi = ObjectInspectorFactory.getStandardStructObjectInspector(test, test2);

        fieldNames.add("productCategory");
        fieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);

        GenericUDF.DeferredObject firstDeferredObject = new MyDeferredObject(test2);

        GenericUDF extract_product_category = new GenericUDFExtractProductCategory();
        extract_product_category.initialize(new ObjectInspector[]{firstInspector});
        extract_product_category.evaluate(new DeferredObject[]{firstDeferredObject});
    }

    public class MyDeferredObject implements DeferredObject {
        private Object value;

        public MyDeferredObject(Object value) {
            this.value = value;
        }

        @Override
        public Object get() throws HiveException {
            return value;
        }
    }

    private class MyListObjectInspector implements ListObjectInspector {
        @Override
        public ObjectInspector getListElementObjectInspector() {
            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldObjectInspectors);
        }

        @Override
        public Object getListElement(Object data, int index) {
            List myList = (List) data;
            if (myList == null || index > myList.size()) {
                return null;
            }
            return myList.get(index);
        }

        @Override
        public int getListLength(Object data) {
            if (data == null) {
                return -1;
            }
            return ((List) data).size();
        }

        @Override
        public List<?> getList(Object data) {
            return (List) data;
        }

        @Override
        public String getTypeName() {
            return null;
        }

        @Override
        public Category getCategory() {
            return Category.LIST;
        }
    }
}
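For question 2, what I was wondering is whether the argument should be built with the standard object inspector factories instead of my own MyListObjectInspector, with the struct data handed to evaluate() as a List of field-value Lists. Below is an untested sketch of what I mean, written as an extra method for the test class above (it would also need an import of org.apache.hadoop.io.Text). Even then I suspect the LazyString cast in evaluate() would still complain about the Text values, so I am not sure this is the whole story.

    @Test
    public void standardInspectorSketch() throws Exception {
        // Describe a single struct<productCategory:string> element
        ArrayList<String> names = new ArrayList<String>();
        names.add("productCategory");
        ArrayList<ObjectInspector> inspectors = new ArrayList<ObjectInspector>();
        inspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        StructObjectInspector elementOI =
                ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);

        // The argument type is array<struct<productCategory:string>>, so wrap the
        // struct inspector in a standard list inspector
        ListObjectInspector argumentOI =
                ObjectInspectorFactory.getStandardListObjectInspector(elementOI);

        // With the standard inspectors each struct is just a List of its field values,
        // and the array is a List of those Lists
        ArrayList<Object> firstStruct = new ArrayList<Object>();
        firstStruct.add(new Text("footwear"));
        ArrayList<Object> secondStruct = new ArrayList<Object>();
        secondStruct.add(new Text("eyewear"));
        ArrayList<Object> structList = new ArrayList<Object>();
        structList.add(firstStruct);
        structList.add(secondStruct);

        GenericUDF udf = new GenericUDFExtractProductCategory();
        udf.initialize(new ObjectInspector[]{argumentOI});
        Object result = udf.evaluate(new DeferredObject[]{new MyDeferredObject(structList)});
        // hoping for something like [footwear, eyewear]
        System.out.println(result);
    }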
