[ https://issues.apache.org/jira/browse/HAWQ-1324?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ruilong Huo updated HAWQ-1324: ------------------------------ Fix Version/s: (was: backlog) 2.1.0.0-incubating > Query cancel cause segment to go into Crash recovery > ---------------------------------------------------- > > Key: HAWQ-1324 > URL: https://issues.apache.org/jira/browse/HAWQ-1324 > Project: Apache HAWQ > Issue Type: Bug > Components: Query Execution > Affects Versions: 2.0.0.0-incubating > Reporter: Ming LI > Assignee: Ming LI > Fix For: 2.1.0.0-incubating > > > A query was cancelled due to this connection issue to HDFS on Isilon. Seg26 > then went into crash recovery due to a INSERT query being cancelled. What > should be the expected behaviour when HDFS becomes unavailable and a Query > fails due to HDFS unavailability. > Below is the HDFS error > {code} > 2017-01-04 03:04:08.382615 > JST,"carund","dwhrun",p574246,th1862944896,"192.168.10.12","47554",2017-01-04 > 03:03:08 JST,0,con198952,,seg29,,,,,"FATAL","08006","connection to client > lost",,,,,,,0,,"postgres.c",3518, > 2017-01-04 03:04:08.420099 > JST,,,p755778,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","3rd party > error log: > 2017-01-04 03:04:08.419969, p574222, th140507423066240, ERROR Handle > Exception: NamenodeImpl.cpp: 670: Unexpected error: status: > STATUS_FILE_NOT_AVAILABLE = 0xC0000467 Path: > hawq_default/16385/16563/802748/26 with path= > ""/hawq_default/16385/16563/802748/26"", > clientname=libhdfs3_client_random_866998528_count_1_pid_574222_tid_140507423066240 > @ Hdfs::Internal::UnWrapper<Hdfs::UnresolvedLinkException, > Hdfs::HdfsIOException, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, > Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing , > Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, > Hdfs::Internal::Nothing>::unwrap(char const, int) > @ Hdfs::Internal::UnWrapper<Hdfs::FileNotFoundException, > Hdfs::UnresolvedLinkException, Hdfs::HdfsIOException, > Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, > Hdfs::Internal::Not hing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, > Hdfs::Internal::Nothing, Hdfs::Internal::Nothing>::unwrap(char const, int) > @ Hdfs::Internal::NamenodeImpl::fsync(std::string const&, std::string const&) > @ Hdfs::Internal::NamenodeProxy::fsync(std::string const&, std::string const&) > @ Hdfs::Internal::OutputStreamImpl::closePipeline() > @ Hdfs::Internal::OutputStreamImpl::close() > @ hdfsCloseFile > @ gpfs_hdfs_closefile > @ HdfsCloseFile > @ HdfsFileClose > @ CleanupTempFiles > @ AbortTransaction > @ AbortCurrentTransaction > @ PostgresMain > @ BackendStartup > @ ServerLoop > @ PostmasterMain > @ main > @ Unknown > @ Unknown",,,,,,,,"SysLoggerMain","syslogger.c",518, > 2017-01-04 03:04:08.420272 > JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 > 03:03:08 > JST,40678725,con198952,cmd4,seg25,,,x40678725,sx1,"WARNING","58030","could > not close file 7 : (hdfs://ffd > lakehd.ffwin.fujifilm.co.jp:8020/hawq_default/16385/16563/802748/26) errno > 5","Unexpected error: status: STATUS_FILE_NOT_AVAILABLE = 0xC0000467 Path: > hawq_default/16385/16563/802748/26 with path=""/hawq_default/16385/16 > 563/802748/26"", > clientname=libhdfs3_client_random_866998528_count_1_pid_574222_tid_140507423066240",,,,,,0,,"fd.c",2762, > {code} > Segment 26 going into Crash recovery - from seg26 log file > {code} > 2017-01-04 03:04:08.420314 > JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 > 03:03:08 > JST,40678725,con198952,cmd4,seg25,,,x40678725,sx1,"LOG","08006","could not > send data to client: 接続が相 > 手からリセットされました",,,,,,,0,,"pqcomm.c",1292, > 2017-01-04 03:04:08.420358 > JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 > 03:03:08 JST,0,con198952,,seg25,,,,,"LOG","08006","could not send data to > client: パイプが切断されました",,,,,,,0, > ,"pqcomm.c",1292, > 2017-01-04 03:04:08.420375 > JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 > 03:03:08 JST,0,con198952,,seg25,,,,,"FATAL","08006","connection to client > lost",,,,,,,0,,"postgres.c",3518, > 2017-01-04 03:04:08.950354 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","server process > (PID 574240) was terminated by signal 11: Segmentation > fault",,,,,,,0,,"postmaster.c",4748, > 2017-01-04 03:04:08.950403 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","terminating > any other active server processes",,,,,,,0,,"postmaster.c",4486, > 2017-01-04 03:04:08.954044 > JST,,,p41605,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Segment RM > exits.",,,,,,,0,,"resourcemanager.c",340, > 2017-01-04 03:04:08.954078 > JST,,,p41605,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Clean up > handler in message server is called.",,,,,,,0,,"rmcomm_MessageServer.c",105, > 2017-01-04 03:04:08.972706 > JST,,,p574711,th1862944896,"192.168.10.12","48121",2017-01-04 03:04:08 > JST,0,,,seg-10000,,,,,"LOG","00000","PID 574308 in cancel request did not > match any process",,,,,,,0,,"postmaster.c",3166 > , > 2017-01-04 03:04:08.976211 > JST,,,p574712,th1862944896,"192.168.10.12","48127",2017-01-04 03:04:08 > JST,0,,,seg-10000,,,,,"LOG","00000","PID 574320 in cancel request did not > match any process",,,,,,,0,,"postmaster.c",3166, > 2017-01-04 03:04:08.979680 > JST,,,p574713,th1862944896,"192.168.10.12","48133",2017-01-04 03:04:08 > JST,0,,,seg-10000,,,,,"LOG","00000","PID 574326 in cancel request did not > match any process",,,,,,,0,,"postmaster.c",3166, > 2017-01-04 03:04:08.998035 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","BeginResetOfPostmasterAfterChildrenAreShutDown: > counter 1",,,,,,,0,,"postmaster.c",1902, > 2017-01-04 03:04:08.998062 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","gp_session_id > high-water mark is 565729",,,,,,,0,,"postmaster.c",1926, > 2017-01-04 03:04:09.076801 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","resetting > shared memory",,,,,,,0,,"postmaster.c",3334, > 2017-01-04 03:04:09.076835 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentFilespace_ShmemSize: > 69192 = gp_max_filespaces: 8 sizeof(FilespaceDirEntryData): 1048 + > PersistentFilespace_SharedDataSize(): > 80",,,,,,,0,,"cdbpersistentfilespace.c",1144, > 2017-01-04 03:04:09.076844 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentTablespace_ShmemSize: > 6304 = gp_max_tablespaces: 16 sizeof(TablespaceDirEntryData): 32 + > PersistentTablespace_SharedDataSize(): > 80",,,,,,,0,,"cdbpersistenttablespace.c",1192, > 2017-01-04 03:04:09.076851 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentDatabase_ShmemSize: > 15984 = PersistentDatabase_SharedDataSize(): 15984 = > PersistentDatabaseSharedData: 80 + MaxPersistentDatabaseDirectories: 256 (db: > 16 ts: 16) sizeof(DatabaseDirEntryData): > 56",,,,,,,0,,"cdbpersistentdatabase.c",1477, > 2017-01-04 03:04:09.076868 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentRelation_ShmemSize: > 3673504 = gp_max_relations: 65536 * sizeof(RelationDirEntryData): 32 + > PersistentRelation_SharedDataSize(): > 80",,,,,,,0,,"cdbpersistentrelation.c",454, > 2017-01-04 03:04:09.076876 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Metadata Cache > Share Memory Size : 155720180",,,,,,,0,,"ipci.c",175, > 2017-01-04 03:04:09.077068 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","temporary > files using default location",,,,,,,0,,"primary_mirror_mode.c",282, > 2017-01-04 03:04:09.077081 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","transaction > files using default pg_system > filespace",,,,,,,0,,"primary_mirror_mode.c",1133, > 2017-01-04 03:04:09.255557 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","[MetadataCache] > Metadata cache initialize successfully. > block_capacity:2097152",,,,,,,0,,"cdbmetadatacache.c",248, > 2017-01-04 03:04:09.266262 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PrimaryMirrorMode: > Processing postmaster reset with recent mode of > 3",,,,,,,0,,"primary_mirror_mode.c",877, > 2017-01-04 03:04:09.266286 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PrimaryMirrorMode: > Processing postmaster reset to non-fault > state",,,,,,,0,,"primary_mirror_mode.c",885, > 2017-01-04 03:04:09.266293 > JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","removing all > temporary files",,,,,,,0,,"fd.c",2194, > {code} > There was a core file generated at the time of the Crash recovery. Below is > output from the core file > {code} > ---from the core file on segment for PID 574240 > #0 0x0000003e7780f867 in ?? () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1 > #1 0x0000003e77810119 in _Unwind_Backtrace () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1 > #2 0x0000003e75cfebf6 in backtrace () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libc.so.6 > #3 0x000000000086e01e in gp_backtrace (processName=<value optimized out>, > postgres_signal_arg=11) at elog.c:4374 > #4 SafeHandlerForSegvBusIll (processName=<value optimized out>, > postgres_signal_arg=11) at elog.c:4510 > #5 <signal handler called> > #6 0x0000003e7780f867 in ?? () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1 > #7 0x0000003e77810119 in _Unwind_Backtrace () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1 > #8 0x0000003e75cfebf6 in backtrace () from > /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libc.so.6 > #9 0x000000000087212f in errstart (elevel=15, filename=0xb29fa8 "postgres.c", > lineno=3523, funcname=0xb25d00 "ProcessInterrupts", domain=<value optimized > out>) at elog.c:492 > #10 0x0000000000872819 in elog_finish (elevel=15, fmt=0xb28710 "Process > interrupt for 'query cancel pending'.") at elog.c:1443 > #11 0x00000000007abfed in ProcessInterrupts () at postgres.c:3523 > #12 0x00000000007ac3c6 in StatementCancelHandler (postgres_signal_arg=<value > optimized out>) at postgres.c:3376 > #13 <signal handler called> > #14 0x00000000004cbfab in varattrib_untoast_ptr_len (d=33352720, > datastart=<value optimized out>, len=<value optimized out>, tofree=<value > optimized out>) at tuptoaster.c:322 > #15 0x0000000000000002 in ?? () at > /opt/gcc-4.4.2/lib/gcc/x86_64-unknown-linux-gnu/4.4.2/../../../../include/c++/4.4.2/bits/vector.tcc:372 > #16 0x00007fff0ff8c2b0 in ?? () > #17 0x000000000065f65d in ExecMakeFunctionResultNoSets (fcache=0x50ff8c5c2, > econtext=0x1fcec14, isNull=<value optimized out>, isDone=<value optimized > out>) at execQual.c:1831 > #18 0x0000000000664d4a in ExecTargetList (projInfo=<value optimized out>, > isDone=<value optimized out>) at execQual.c:5467 > #19 ExecProject (projInfo=<value optimized out>, isDone=<value optimized > out>) at execQual.c:5653 > #20 0x0000000000685641 in ExecResult (node=0x1f95ae0) at nodeResult.c:222 > #21 0x000000000065e0b4 in ExecProcNode (node=0x1f95ae0) at execProcnode.c:891 > #22 0x00000000006855c5 in NextInputSlot (node=0x1f671e0) at nodeResult.c:95 > #23 ExecResult (node=0x1f671e0) at nodeResult.c:194 > #24 0x000000000065e0b4 in ExecProcNode (node=0x1f671e0) at execProcnode.c:891 > #25 0x000000000069d4a2 in ExecDML (node=0x1f62f08) at nodeDML.c:65 > #26 0x000000000065e295 in ExecProcNode (node=0x1f62f08) at execProcnode.c:1014 > #27 0x000000000065a05a in ExecutePlan (estate=0x1f62bb0, planstate=<value > optimized out>, operation=<value optimized out>, numberTuples=<value > optimized out>, direction=<value optimized out>, > dest=<value optimized out>) at execMain.c:3231 > #28 0x000000000065a9e3 in ExecutorRun (queryDesc=<value optimized out>, > direction=<value optimized out>, count=<value optimized out>) at > execMain.c:1213 > #29 0x00000000007b576a in ProcessQuery (portal=<value optimized out>, > stmt=0x1f31110, params=<value optimized out>, dest=0x7fca48141ec8, > completionTag=0x7fff0ff8ced0 "") at pquery.c:324 > #30 0x00000000007b5e0f in PortalRunMulti (portal=0x1f09610, isTopLevel=1 > '\001', dest=0x7fca48141ec8, altdest=<value optimized out>, > completionTag=0x7fff0ff8ced0 "") at pquery.c:2051 > #31 0x00000000007b63a2 in PortalRun (portal=<value optimized out>, > count=<value optimized out>, isTopLevel=16 '\020', dest=<value optimized > out>, altdest=<value optimized out>, completionTag=<value optimized out>) > at pquery.c:1596 > #32 0x00000000007b2e61 in exec_mpp_query (argc=<value optimized out>, > argv=<value optimized out>, username=<value optimized out>) at postgres.c:1414 > #33 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, > username=<value optimized out>) at postgres.c:4945 > #34 0x0000000000763f23 in BackendRun (port=0x1df4210) at postmaster.c:5889 > #35 BackendStartup (port=0x1df4210) at postmaster.c:5484 > #36 0x000000000076467d in ServerLoop () at postmaster.c:2163 > #37 0x00000000007664ae in PostmasterMain (argc=9, argv=0x1deb5b0) at > postmaster.c:1454 > #38 0x00000000006c072a in main (argc=9, argv=0x1deb570) at main.c:226 > {code} > From theMaster log files showing connection issues with server > f418p206.f418.local where seg26 is located > {code} > Dispatcher thinks f418p206.f418.local is down.",,,,,,"INSERT INTO > carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR / > SELECT > A.cd_custm, > A.cd_itm_ser_1_group, > A.name_itm_ser_1_group, > MAX(A.ctg_sex) AS ctg_sex, > MAX(CASE > WHEN A.dt_frst_ord_cmprd_ser_1_g IS NULL > THEN '9' > WHEN A.dt_frst_ord_trial_ser_1_g IS NOT NULL AND A.dt_frst_ord_cmprd_ser_1_g > IS NOT NULL > AND A.dt_frst_ord_trial_ser_1_g < A.dt_frst_ord_cmprd_ser_1_g > THEN '2' > WHEN A.dt_frst_ord_smpl_ser_1_g IS NOT NULL AND A.dt_frst_ord_cmprd_ser_1_g > IS NOT NULL > AND A.dt_frst_ord_smpl_ser_1_g < A.dt_frst_ord_cmprd_ser_1_g > THEN '1' > ELSE '6' > END) AS ctg_trial_drct, > MIN(A.flg_dm_allow_now) AS flg_dm_allow_now, > MIN(A.flg_em_allow_now) AS flg_em_allow_now, > MIN(A.flg_tel_allow_now) AS flg_tel_allow_now, > MIN(A.flg_ctel_allow_now) AS flg_ctel_allow_now, > MAX(A.ctg_prdc_tr_now_ser_1_g) AS ctg_prdc_tr_now_ser_1_g, > MAX(CASE > WHEN A.ctg_prdc_tr_now_ser_1_g IN ('0','1') THEN '02' > ELSE '01' > 2017-01-04 03:04:09.475256 > JST,"carund","dwhrun",p551568,th-1936484224,"192.168.10.14","3885",2017-01-04 > 03:03:05 > JST,968689,con198950,cmd4,seg-1,,,x968689,sx1,"LOG","00000","Dispatcher sends > 3 failed host(s) to resource manager.",,,,,,"INSERT INTO > carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR / > 2017-01-04 03:04:09.475828 > JST,,,p214748,th-1936484224,,,,0,con152207,,seg10000,,,,,"LOG","00000","Resource > manager does not probe the status of host f418p206.f418.local because it is > in RUAlive pending status already.",,,,,,,0,,"requesthandler.c",1059, > 2017-01-04 03:04:09.475835 > JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Resource > manager does not probe the status of host f418p206.f418.local because it is > in RUAlive pending status already.",,,,,,,0,,"requesthandler.c",1059, > 2017-01-04 03:04:09.475851 > JST,"carund","dwhrun",p551568,th-1936484224,"192.168.10.14","3885",2017-01-04 > 03:03:05 JST,968689,con198950,cmd4,seg-1,,,x968689,sx1,"LOG","00000","Succeed > in sending failed host to resource manager.",,,,,,"INSERT INTO > carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR / > SELECT > A.cd_custm, > A.cd_itm_ser_1_group, > 2017-01-04 03:04:09.479952 > JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Master > RM finds segment:f418p206.f418.local 's RM process has restarted. old > status:1",,,,,,,0,,"resourcepool.c",1122, > 2017-01-04 03:04:09.479971 > JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Host > f418p206.f418.local is set availability 2. Cluster currently has 11 available > nodes.",,,,,,,0,,"resourcepool.c",1833, > 2017-01-04 03:04:09.480841 > JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 > 03:04:09 JST,968702,,cmd1,seg-10000,,,x968702,sx1,"LOG","00000","statement: > BEGIN",,,,,,"BEGIN",0,,"postgres.c",1575, > 2017-01-04 03:04:09.480943 > JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 > 03:04:09 JST,968702,,cmd1,seg-10000,,,x968702,sx1,"LOG","00000","duration: > 0.190 ms",,,,,,"BEGIN",0,,"postgres.c",1834, > 2017-01-04 03:04:09.481069 > JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 > 03:04:09 JST,968702,,cmd2,seg-10000,,,x968702,sx1,"LOG","00000","statement: > UPDATE gp_segment_configuration SET status='d', description='resource manager > process was reset' WHERE registration_order=2",,,,,,"UPDATE > gp_segment_configuration SET status='d', description='resource manager > process was reset' WHERE registration_order=2",0,,"postgres.c",1575, > {code} > All the log files and core file are located on the scripts server: > gpadmin@10.65.109.130 > passwd: changeme > cdsr 46429 > Go to HDB_log_files_2017-01-04_134818/HDB_info to for HAWQ settings. > Go to HDB_log_files_2017-01-04_134818/logs/<hostname> for segment log files > f418p101 is HAWQ master. > f418p102 is HAWQ standby master. > f418p206 is HAWQ server hosting seg26 that had the crash recovery -- This message was sent by Atlassian JIRA (v6.3.15#6346)