STORE finaldata INTO '$OUT' USING AvroStorage('schema_uri','$SCHEMA');
OUT=/user/pig/outputTest/20120422 - that is the location where I would like
the final data to be stored. For the $SCHEMA variable I tried various
combinations:
$SCHEMA=hdfs://namenodeha:8020/user/pig/outputTest/pif.json - that is the
Avro schema I would like to use for the store. The only difference should be
that the schema doesn't contain the disambiguate operator (::).
That is the only problem in the schema - the :: operator is used here
for no reason. I just need to get rid of the dirtydata:: prefix:
finaldata: {*dirtydata::*Version: int,*dirtydata::*Dob: (Value: int),
*dirtydata::*StoreId: chararray,*dirtydata::*TransactionBlockNumber: int,
*dirtydata::*TransactionData: {TransactionData: (TransactionHeader: (Dob:
(Value: int),StoreId: chararray,TransactionId: int,TransactionTime:
(UnixUtcTime: long,OffsetMinutes: int),TerminalId:
chararray,ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),CustomData:
{KeyValue: (Key: chararray,Value: chararray)},StoreInfo: (IsQuickService:
boolean,CurrencyIsoCode: chararray),NewChecks: {NewCheckData: (CheckId:
chararray,CheckHeader: (CarriedOver: boolean,TerminalId:
chararray,Training: boolean,Period: (Id: chararray,Label:
chararray),GroupInfo: (Id: chararray,Label: (Id: chararray,Label:
chararray),IsTable: boolean),Events: {CheckEvent: (CustomEventLabel:
chararray,Time: (UnixUtcTime: long,OffsetMinutes: int),CheckEventType:
chararray)},CheckResponsibleEmployees: {CheckResponsibleEmployee:
(Employee: (Id: chararray,Name: chararray),Time: (UnixUtcTime:
long,OffsetMinutes: int))},GuestCounting: (Guests: (Value: chararray),Mode:
chararray),PrintedCheckId: chararray,RevenueCenter: (Id: chararray,Label:
chararray),Room: (Id: chararray,Label: chararray)))},Checks: {CheckData:
(CheckId: chararray,CheckHeaderUpdate: (Period: (Id: chararray,Label:
chararray),GroupInfo: (Id: chararray,Label: (Id: chararray,Label:
chararray),IsTable: boolean),Events: {CheckEvent: (CustomEventLabel:
chararray,Time: (UnixUtcTime: long,OffsetMinutes: int),CheckEventType:
chararray)},CheckResponsibleEmployees: {CheckResponsibleEmployee:
(Employee: (Id: chararray,Name: chararray),Time: (UnixUtcTime:
long,OffsetMinutes: int))},GuestCounting: (Guests: (Value: chararray),Mode:
chararray),PrintedCheckId: chararray,RevenueCenter: (Id: chararray,Label:
chararray),Room: (Id: chararray,Label: chararray)),Summary: (NetAmount:
(Value: chararray),Total: (Value: chararray)),CheckItems: {CheckItem:
(AbstractCheckElement: (Amount: (Value: chararray),ElementId:
chararray,ElementKind: (Id: chararray,Label: chararray),CreatedOn:
(UnixUtcTime: long,OffsetMinutes: int),ResponsibleEmployees: (Employee:
(Id: chararray,Name: chararray),Manager: (Id: chararray,Name:
chararray))),Categories: {Category: (CategoryInfo: (Id: chararray,Label:
chararray),Type: chararray)},ModifierInfo: (Label: (Id: chararray,Label:
chararray),ItemModifierInfoType: chararray),NetAmount: (Value:
chararray),OrderMode: (Id: chararray,Label: chararray),OriginalPrice:
(Value: chararray),ParentItem: chararray,Quantity: (Value:
chararray),Revenue: boolean,Seat: int,ProcessedInKitchen: boolean,GiftCard:
boolean,SplitItemElementId: chararray)},Comps: {CheckComp:
(AbstractCheckLinkedElement: (AbstractCheckElement: (Amount: (Value:
chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),Items: {ItemAmount:
(Amount: (Value: chararray),ElementId: chararray)}),CheckCompType:
chararray,Note: chararray)},Payments: {CheckPayment: (AbstractCheckElement:
(Amount: (Value: chararray),ElementId: chararray,ElementKind: (Id:
chararray,Label: chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),ChangeBack: (Value:
chararray),DocumentId: chararray,Rounding: (Value: chararray),Tip: (Value:
chararray),CheckPaymentType: chararray,Card: chararray)},Promos:
{CheckPromo: (AbstractCheckLinkedElement: (AbstractCheckElement: (Amount:
(Value: chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),Items: {ItemAmount:
(Amount: (Value: chararray),ElementId: chararray)}),Discount: (Value:
chararray),CheckPromoType: chararray)},Surcharges: {CheckSurcharge:
(AbstractCheckLinkedElement: (AbstractCheckElement: (Amount: (Value:
chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),Items: {ItemAmount:
(Amount: (Value: chararray),ElementId: chararray)}),Rate: (Value:
chararray),CheckSurchargeType: chararray,Accounting: chararray)},Voids:
{CheckVoid: (AbstractCheckLinkedElement: (AbstractCheckElement: (Amount:
(Value: chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
chararray),Manager: (Id: chararray,Name: chararray))),Items: {ItemAmount:
(Amount: (Value: chararray),ElementId: chararray)}),CheckVoidType:
chararray,Note: chararray)},RemovedElements: {RemovedElement: (ElementId:
chararray,RemovedElementType: chararray)})},LaborData: {LaborData: (Shifts:
{Shift: (State: chararray,StartDate: (UnixUtcTime: long,OffsetMinutes:
int),EndDate: (UnixUtcTime: long,OffsetMinutes: int),TotalPay: (Value:
chararray),PayRates: {ShiftPayRate: (AfterHours: int,HourlyRate: (Value:
chararray),IsOvertime: boolean)},ShiftNumber: int,Job: (Id:
chararray,Label: chararray),Breaks: {Break: (Paid: boolean,StartDate:
(UnixUtcTime: long,OffsetMinutes: int),EndDate: (UnixUtcTime:
long,OffsetMinutes: int))},IsManager: boolean)},Employee: (Id:
chararray,Name: chararray))})},*dirtydata::*Created: (UnixUtcTime:
long,OffsetMinutes: int)}
On 15 October 2014 19:00, Serega Sheypak <[email protected]> wrote:
> what are values for these variables:
> STORE finaldata INTO '$OUT' USING AvroStorage('schema_uri','$SCHEMA');
>
> 2014-10-15 17:51 GMT+04:00 Jakub Stransky <[email protected]>:
>
> > No_schema_check doesn't help. Essentially we need either to remove the
> > relation name or to ensure that the schema is used during store. Here it
> > seems that even when a schema is supplied, the internal schema takes
> > precedence. And that causes problems.
> >
> > On 15 October 2014 15:41, praveenesh kumar <[email protected]> wrote:
> >
> > > Not really sure, but can you try adding 'no_schema_check' while using
> > > AvroStorage in the Store function.
> > >
> > > On Wed, Oct 15, 2014 at 1:59 PM, Jakub Stransky <[email protected]
> >
> > > wrote:
> > >
> > > > Hello experienced users,
> > > >
> > > > I am working with avro data files using AvroStorage and I am facing
> > > > following issue. I cannot store the data of my result back to avro
> data
> > > > file.
> > > >
> > > > I have following script
> > > > inputdata = load '$INP' using AvroStorage();
> > > > dirtydata = DISTINCT inputdata;
> > > > sodtr = FILTER dirtydata BY TransactionBlockNumber == 1;
> > > > sto = FOREACH sodtr GENERATE Dob.Value AS Dob,StoreId,
> > > > Created.UnixUtcTime;
> > > > g = GROUP sto BY (Dob,StoreId);
> > > > sodtime = FOREACH g GENERATE group.Dob AS Dob, group.StoreId AS
> > StoreId,
> > > > MAX(sto.UnixUtcTime) AS latestStartOfDayTime;
> > > >
> > > > joined = JOIN dirtydata BY (Dob.Value, StoreId) LEFT OUTER, sodtime
> BY
> > > > (Dob, StoreId);
> > > >
> > > > cleandata = FILTER joined BY dirtydata::Created.UnixUtcTime >=
> > > > sodtime.latestStartOfDayTime; --1412864846
> > > > finaldata = FOREACH cleandata GENERATE dirtydata::Version ..
> > > > dirtydata::Created;
> > > >
> > > > STORE finaldata INTO '$OUT' USING
> AvroStorage('schema_uri','$SCHEMA');
> > > >
> > > > Where $SCHEMA contains exactly the same schema as inputdata. By pig
> > > > operations I got several nested relation, columns etc. Those should
> be
> > > > removed by .. operator. Resulting schema using describe
> > > >
> > > >
> > > > finaldata: {dirtydata*::*Version: int,dirtydata::Dob: (Value:
> > > > int),dirtydata::StoreId: chararray,dirtydata::TransactionBlockNumber:
> > > > int,dirtydata::TransactionData: {TransactionData: (TransactionHeader:
> > > (Dob:
> > > > (Value: int),StoreId: chararray,TransactionId: int,TransactionTime:
> > > > (UnixUtcTime: long,OffsetMinutes: int),TerminalId:
> > > > chararray,ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),CustomData:
> > > > {KeyValue: (Key: chararray,Value: chararray)},StoreInfo:
> > (IsQuickService:
> > > > boolean,CurrencyIsoCode: chararray),NewChecks: {NewCheckData:
> (CheckId:
> > > > chararray,CheckHeader: (CarriedOver: boolean,TerminalId:
> > > > chararray,Training: boolean,Period: (Id: chararray,Label:
> > > > chararray),GroupInfo: (Id: chararray,Label: (Id: chararray,Label:
> > > > chararray),IsTable: boolean),Events: {CheckEvent: (CustomEventLabel:
> > > > chararray,Time: (UnixUtcTime: long,OffsetMinutes:
> int),CheckEventType:
> > > > chararray)},CheckResponsibleEmployees: {CheckResponsibleEmployee:
> > > > (Employee: (Id: chararray,Name: chararray),Time: (UnixUtcTime:
> > > > long,OffsetMinutes: int))},GuestCounting: (Guests: (Value:
> > > chararray),Mode:
> > > > chararray),PrintedCheckId: chararray,RevenueCenter: (Id:
> > chararray,Label:
> > > > chararray),Room: (Id: chararray,Label: chararray)))},Checks:
> > {CheckData:
> > > > (CheckId: chararray,CheckHeaderUpdate: (Period: (Id: chararray,Label:
> > > > chararray),GroupInfo: (Id: chararray,Label: (Id: chararray,Label:
> > > > chararray),IsTable: boolean),Events: {CheckEvent: (CustomEventLabel:
> > > > chararray,Time: (UnixUtcTime: long,OffsetMinutes:
> int),CheckEventType:
> > > > chararray)},CheckResponsibleEmployees: {CheckResponsibleEmployee:
> > > > (Employee: (Id: chararray,Name: chararray),Time: (UnixUtcTime:
> > > > long,OffsetMinutes: int))},GuestCounting: (Guests: (Value:
> > > chararray),Mode:
> > > > chararray),PrintedCheckId: chararray,RevenueCenter: (Id:
> > chararray,Label:
> > > > chararray),Room: (Id: chararray,Label: chararray)),Summary:
> (NetAmount:
> > > > (Value: chararray),Total: (Value: chararray)),CheckItems: {CheckItem:
> > > > (AbstractCheckElement: (Amount: (Value: chararray),ElementId:
> > > > chararray,ElementKind: (Id: chararray,Label: chararray),CreatedOn:
> > > > (UnixUtcTime: long,OffsetMinutes: int),ResponsibleEmployees:
> (Employee:
> > > > (Id: chararray,Name: chararray),Manager: (Id: chararray,Name:
> > > > chararray))),Categories: {Category: (CategoryInfo: (Id:
> > chararray,Label:
> > > > chararray),Type: chararray)},ModifierInfo: (Label: (Id:
> > chararray,Label:
> > > > chararray),ItemModifierInfoType: chararray),NetAmount: (Value:
> > > > chararray),OrderMode: (Id: chararray,Label: chararray),OriginalPrice:
> > > > (Value: chararray),ParentItem: chararray,Quantity: (Value:
> > > > chararray),Revenue: boolean,Seat: int,ProcessedInKitchen:
> > > boolean,GiftCard:
> > > > boolean,SplitItemElementId: chararray)},Comps: {CheckComp:
> > > > (AbstractCheckLinkedElement: (AbstractCheckElement: (Amount: (Value:
> > > > chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
> > > > chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
> > > > int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),Items:
> > {ItemAmount:
> > > > (Amount: (Value: chararray),ElementId: chararray)}),CheckCompType:
> > > > chararray,Note: chararray)},Payments: {CheckPayment:
> > > (AbstractCheckElement:
> > > > (Amount: (Value: chararray),ElementId: chararray,ElementKind: (Id:
> > > > chararray,Label: chararray),CreatedOn: (UnixUtcTime:
> > long,OffsetMinutes:
> > > > int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),ChangeBack:
> > (Value:
> > > > chararray),DocumentId: chararray,Rounding: (Value: chararray),Tip:
> > > (Value:
> > > > chararray),CheckPaymentType: chararray,Card: chararray)},Promos:
> > > > {CheckPromo: (AbstractCheckLinkedElement: (AbstractCheckElement:
> > (Amount:
> > > > (Value: chararray),ElementId: chararray,ElementKind: (Id:
> > > chararray,Label:
> > > > chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
> > > > int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),Items:
> > {ItemAmount:
> > > > (Amount: (Value: chararray),ElementId: chararray)}),Discount: (Value:
> > > > chararray),CheckPromoType: chararray)},Surcharges: {CheckSurcharge:
> > > > (AbstractCheckLinkedElement: (AbstractCheckElement: (Amount: (Value:
> > > > chararray),ElementId: chararray,ElementKind: (Id: chararray,Label:
> > > > chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
> > > > int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),Items:
> > {ItemAmount:
> > > > (Amount: (Value: chararray),ElementId: chararray)}),Rate: (Value:
> > > > chararray),CheckSurchargeType: chararray,Accounting:
> chararray)},Voids:
> > > > {CheckVoid: (AbstractCheckLinkedElement: (AbstractCheckElement:
> > (Amount:
> > > > (Value: chararray),ElementId: chararray,ElementKind: (Id:
> > > chararray,Label:
> > > > chararray),CreatedOn: (UnixUtcTime: long,OffsetMinutes:
> > > > int),ResponsibleEmployees: (Employee: (Id: chararray,Name:
> > > > chararray),Manager: (Id: chararray,Name: chararray))),Items:
> > {ItemAmount:
> > > > (Amount: (Value: chararray),ElementId: chararray)}),CheckVoidType:
> > > > chararray,Note: chararray)},RemovedElements: {RemovedElement:
> > (ElementId:
> > > > chararray,RemovedElementType: chararray)})},LaborData: {LaborData:
> > > (Shifts:
> > > > {Shift: (State: chararray,StartDate: (UnixUtcTime:
> long,OffsetMinutes:
> > > > int),EndDate: (UnixUtcTime: long,OffsetMinutes: int),TotalPay:
> (Value:
> > > > chararray),PayRates: {ShiftPayRate: (AfterHours: int,HourlyRate:
> > (Value:
> > > > chararray),IsOvertime: boolean)},ShiftNumber: int,Job: (Id:
> > > > chararray,Label: chararray),Breaks: {Break: (Paid: boolean,StartDate:
> > > > (UnixUtcTime: long,OffsetMinutes: int),EndDate: (UnixUtcTime:
> > > > long,OffsetMinutes: int))},IsManager: boolean)},Employee: (Id:
> > > > chararray,Name: chararray))})},dirtydata::Created: (UnixUtcTime:
> > > > long,OffsetMinutes: int)}
> > > >
> > > > *I am getting the error: Pig Schema contains a name that is not
> > > > allowed in Avro. This is probably because the :: remains for
> > > > dirtydata. Is there a way to strip this off (as there is now no point
> > > > in it being there)? Otherwise the schema should be identical to the
> > > > input schema.*
> > > >
> > > > *Thanks for helping me out*
> > > > *Jakub*
> > > >
> > >
> >
> >
> >
> > --
> > Jakub Stransky
> > cz.linkedin.com/in/jakubstransky
> >
>
--
Jakub Stransky
cz.linkedin.com/in/jakubstransky