Hi!
I have left some comments in the code (lines starting with ///) and removed
the body of the md5 implementation to keep the answer short.
Note that I am not sure what you are trying to achieve with this UDA -
can you explain what countMD5 would be used for?
> void md5(const unsigned char message[], int len, unsigned char result[])
> {
>
...
> memcpy(result, r, sizeof(int) * 4);
>
...
> }
>
> void init_func(FunctionContext* context, StringVal* val) {
> val->is_null = true;
> }
> void update_func(FunctionContext* context, const StringVal& str,
> StringVal* result) {
> if (str.is_null) return;
> if (result->is_null) {
>
> unsigned char *outbuf=context->Allocate(17);
> outbuf[16]='\0';
> md5(str.ptr, str.len, outbuf);
>
> uint8_t* copy = context->Allocate(17);
>
> if (copy == NULL) return;
> memcpy(copy, outbuf, 16);
> context->Free(outbuf);
> *result = StringVal(copy, str.len);
>
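/// str.len: my understanding is that the hash is always 16 bytes, so the
length should be a fixed 16 (or 17 if it is \0-terminated) - otherwise the
length stored in the StringVal does not match the 17-byte buffer allocated above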
> return;
> }
> unsigned char *outbuf1=context->Allocate(17);
>
/// Free is never called on outbuf1 - note that an array on the stack would
work just as well as a temporary buffer
> outbuf1[16]='\0';
> md5(str.ptr, sizeof(str.ptr), outbuf1);
>
/// sizeof(str.ptr): this is the size of the pointer itself (always 8 on a
64-bit platform), not the length of the string - shouldn't it be str.len?
> uint8_t* copy1 = context->Allocate(17);
>
> for(int i=0;i<16;i++)
> {
> copy1[i]=outbuf1[i] & result->ptr[i];
>
/// using the & operator above means that result will contain fewer and fewer
1 bits, so it will converge to 0 - is this intentional?
> }
>
> *result = StringVal(copy1, 17);
> return;
>
> }
> void merge_func(FunctionContext* context, const StringVal& src, StringVal*
> dst) {
> if (src.is_null) return;
> for(int i=0;i<16;i++)
> {
> dst->ptr[i]=src.ptr[i] & dst->ptr[i];
>
/// same as my previous comment: this will converge to 0 if there are many
distinct values
> }
> }
>
> StringVal serialize_func(FunctionContext* context, const StringVal& val) {
> if (val.is_null) return val;
> unsigned char *outbuf1=context->Allocate(17);
> outbuf1[16]='\0';
>
/// outbuf1 is not freed - it is actually not used at all
> uint8_t* copy = context->Allocate(val.len);
> memcpy(copy, val.ptr, 17);
> return StringVal(copy,17);
> }
>
> StringVal finalize_func(FunctionContext* context, const StringVal& val) {
> if (val.is_null) return val;
> unsigned char *outbuf1=context->Allocate(17);
> outbuf1[16]='\0';
>
/// outbuf1 is not freed - it is actually not used at all
> uint8_t* copy = context->Allocate(val.len);
> memcpy(copy, val.ptr, 17);
> return StringVal(copy,17);
> }
>
>
>
>
>
>
> ########################
> ########################
> define function SQL in impala-shell:
> create aggregate function countMD5(string) returns string location
> 'hdfs://nameservice1:8020//user/hive/udfjars/libmd5udaf.so'
> init_fn='init_func' update_fn='update_func' merge_fn='merge_func'
> serialize_fn='serialize_func' finalize_fn='finalize_func';
>
>
> Maybe my C++ code has some problems, could you help me?
>