Re: [ceph-users] pgs stuck unclean since forever, current state active+remapped

2013-08-15 Thread 不坏阿峰
Many thanks. I did that and resolved it with:

# ceph osd getcrushmap -o /tmp/crush
# crushtool -i /tmp/crush --enable-unsafe-tunables \
    --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 \
    --set-choose-total-tries 50 -o /tmp/crush.new
root@ceph-admin:/etc/ceph# ceph osd setcrushmap -i /tmp/crush.new

So far, health is OK.
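
If you want to double-check that the non-default values actually took, one way (the file
names below are just examples) is to pull the installed map back out, decompile it, and
look for the tunable lines near the top of the text map:

# ceph osd getcrushmap -o /tmp/crush.check
# crushtool -d /tmp/crush.check -o /tmp/crush.check.txt
# grep ^tunable /tmp/crush.check.txt
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50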

2013/8/16 Gregory Farnum :
> They're unclean because CRUSH isn't generating an acting set of
> sufficient size, so the OSDs/monitors are keeping them remapped in
> order to maintain replication guarantees. Look in the docs for the
> CRUSH tunables options for a discussion of this.
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>
>
> On Mon, Aug 12, 2013 at 7:16 PM, 不坏阿峰  wrote:
>> I have PGs that have been stuck for a long time and I don't know how to fix it. Can
>> someone help check?
>>
>> Environment: Debian 7 + Ceph 0.61.7
>>
>> 
>> root@ceph-admin:~# ceph -s
>>health HEALTH_WARN 6 pgs stuck unclean
>>monmap e2: 2 mons at {a=192.168.250.15:6789/0,b=192.168.250.8:6789/0},
>> election epoch 8, quorum 0,1 a,b
>>osdmap e159: 4 osds: 4 up, 4 in
>> pgmap v23487: 584 pgs: 578 active+clean, 6 active+remapped; 4513 MB
>> data, 12658 MB used, 387 GB / 399 GB avail; 426B/s wr, 0op/s
>>mdsmap e114: 1/1/1 up {0=a=up:active}, 1 up:standby
>>
>> --
>> root@ceph-admin:~# ceph health detail
>> HEALTH_WARN 6 pgs stuck unclean
>> pg 0.50 is stuck unclean since forever, current state active+remapped, last
>> acting [3,1]
>> pg 1.4f is stuck unclean since forever, current state active+remapped, last
>> acting [3,1]
>> pg 2.4e is stuck unclean since forever, current state active+remapped, last
>> acting [3,1]
>> pg 1.8a is stuck unclean since forever, current state active+remapped, last
>> acting [2,1]
>> pg 0.8b is stuck unclean since forever, current state active+remapped, last
>> acting [2,1]
>> pg 2.89 is stuck unclean since forever, current state active+remapped, last
>> acting [2,1]
>> --
>> root@ceph-admin:~# ceph osd tree
>>
>> # id    weight  type name                       up/down reweight
>> -1      4       root default
>> -3      2               rack unknownrack
>> -2      2                       host ceph-admin
>> 0       1                               osd.0   up      1
>> 1       1                               osd.1   up      1
>> -4      1               host ceph-node02
>> 2       1                       osd.2   down    1
>> -5      1               host ceph-node01
>> 3       1                       osd.3   up      1
>> ---
>> root@ceph-admin:~# ceph osd dump
>>
>> epoch 159
>> fsid db32486a-7ad3-4afe-8b67-49ee2a6dcecf
>> created 2013-08-08 13:45:52.579015
>> modified 2013-08-12 05:18:37.895385
>> flags
>>
>> pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
>> pg_num 192 pgp_num 192 last_change 1 owner 0 crash_replay_interval 45
>> pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins
>> pg_num 192 pgp_num 192 last_change 1 owner 0
>> pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins
>> pg_num 192 pgp_num 192 last_change 1 owner 0
>> pool 3 'volumes' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
>> pg_num 8 pgp_num 8 last_change 39 owner 18446744073709551615
>>
>> max_osd 5
>> osd.0 up   in  weight 1 up_from 138 up_thru 157 down_at 137
>> last_clean_interval [45,135) 192.168.250.15:6803/5735
>> 192.168.250.15:6804/5735 192.168.250.15:6805/5735 exists,up
>> 99f2aec0-2367-4b68-86f2-58d6d41589c6
>> osd.1 up   in  weight 1 up_from 140 up_thru 157 down_at 137
>> last_clean_interval [47,136) 192.168.250.15:6806/6882
>> 192.168.250.15:6807/6882 192.168.250.15:6808/6882 exists,up
>> d458ca35-ec55-47a9-a7ce-47b9ddf4d889
>> osd.2 up   in  weight 1 up_from 157 up_thru 158 down_at 135
>> last_clean_interval [48,134) 192.168.250.8:6800/3564 192.168.250.8:6801/3564
>> 192.168.250.8:6802/3564 exists,up c4ee9f05-bd5f-4536-8cb8-0af82c00d3d6
>> osd.3 up   in  weight 1 up_from 143 up_thru 157 down_at 141
>> last_clean_interval [53,141) 192.168.250.16:6802/14618
>> 192.168.250.16:6804/14618 192.168.250.16:6805/14618 exists,up
>> e9d67b85-97d1-4635-95c8-f7c50cd7f6b1
>>
>> pg_temp 0.50 [3,1]
>> pg_temp 0.8b [2,1]
>> pg_temp 1.4f [3,1]
>> pg_temp 1.8a [2,1]
>> pg_temp 2.4e [3,1]
>> pg_temp 2.89 [2,1]
>> --
>> root@ceph-admin:/etc/ceph# crushtool -d /tmp/crushmap
>> # begin crush map
>>
>> # devices
>> device 0 osd.0
>> device 1 osd.1
>> device 2 osd.2
>> device 3 osd.3
>>
>> # types
>> type 0 osd
>> type 1 host
>> type 2 rack
>> type 3 row
>> type 4 room
>> type 5 datacenter
>> type 6 root
>>
>> # buckets
>> host ceph-admin {
>> id -2   # do not change unnecessarily
>> # weight 2.000
>> alg straw
>> hash 0  # rjenkins1
>> item osd.0 weight 1.000
>> item osd.1 weight 1.000
>> }
>> rack unknownrack {
>> id -3   # do not change unnecessarily
>> # weight 2.000
>> alg straw
>> hash 0  # rjenkins1
>> item ce

Re: [ceph-users] pgs stuck unclean since forever, current state active+remapped

2013-08-15 Thread Gregory Farnum
They're unclean because CRUSH isn't generating an acting set of
sufficient size, so the OSDs/monitors are keeping them remapped in
order to maintain replication guarantees. Look in the docs for the
CRUSH tunables options for a discussion of this.
-Greg
Software Engineer #42 @ http://inktank.com | http://ceph.com
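
One way to see this concretely (the map path and rule number below are examples, not taken
from the cluster in question) is to test-map the compiled CRUSH map and ask crushtool to
report any inputs that resolve to fewer OSDs than the pool's replica count; with the legacy
tunables a map like this will typically print a handful of bad mappings, and after raising
the retry tunables it should print none:

# ceph osd getcrushmap -o /tmp/crush
# crushtool -i /tmp/crush --test --rule 0 --num-rep 2 --show-bad-mappings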


On Mon, Aug 12, 2013 at 7:16 PM, 不坏阿峰  wrote:
> I have PGs that have been stuck for a long time and I don't know how to fix it. Can
> someone help check?
>
> Environment: Debian 7 + Ceph 0.61.7
>
> 
> root@ceph-admin:~# ceph -s
>health HEALTH_WARN 6 pgs stuck unclean
>monmap e2: 2 mons at {a=192.168.250.15:6789/0,b=192.168.250.8:6789/0},
> election epoch 8, quorum 0,1 a,b
>osdmap e159: 4 osds: 4 up, 4 in
> pgmap v23487: 584 pgs: 578 active+clean, 6 active+remapped; 4513 MB
> data, 12658 MB used, 387 GB / 399 GB avail; 426B/s wr, 0op/s
>mdsmap e114: 1/1/1 up {0=a=up:active}, 1 up:standby
>
> --
> root@ceph-admin:~# ceph health detail
> HEALTH_WARN 6 pgs stuck unclean
> pg 0.50 is stuck unclean since forever, current state active+remapped, last
> acting [3,1]
> pg 1.4f is stuck unclean since forever, current state active+remapped, last
> acting [3,1]
> pg 2.4e is stuck unclean since forever, current state active+remapped, last
> acting [3,1]
> pg 1.8a is stuck unclean since forever, current state active+remapped, last
> acting [2,1]
> pg 0.8b is stuck unclean since forever, current state active+remapped, last
> acting [2,1]
> pg 2.89 is stuck unclean since forever, current state active+remapped, last
> acting [2,1]
> --
> root@ceph-admin:~# ceph osd tree
>
> # id    weight  type name                       up/down reweight
> -1      4       root default
> -3      2               rack unknownrack
> -2      2                       host ceph-admin
> 0       1                               osd.0   up      1
> 1       1                               osd.1   up      1
> -4      1               host ceph-node02
> 2       1                       osd.2   down    1
> -5      1               host ceph-node01
> 3       1                       osd.3   up      1
> ---
> root@ceph-admin:~# ceph osd dump
>
> epoch 159
> fsid db32486a-7ad3-4afe-8b67-49ee2a6dcecf
> created 2013-08-08 13:45:52.579015
> modified 2013-08-12 05:18:37.895385
> flags
>
> pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
> pg_num 192 pgp_num 192 last_change 1 owner 0 crash_replay_interval 45
> pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins
> pg_num 192 pgp_num 192 last_change 1 owner 0
> pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins
> pg_num 192 pgp_num 192 last_change 1 owner 0
> pool 3 'volumes' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
> pg_num 8 pgp_num 8 last_change 39 owner 18446744073709551615
>
> max_osd 5
> osd.0 up   in  weight 1 up_from 138 up_thru 157 down_at 137
> last_clean_interval [45,135) 192.168.250.15:6803/5735
> 192.168.250.15:6804/5735 192.168.250.15:6805/5735 exists,up
> 99f2aec0-2367-4b68-86f2-58d6d41589c6
> osd.1 up   in  weight 1 up_from 140 up_thru 157 down_at 137
> last_clean_interval [47,136) 192.168.250.15:6806/6882
> 192.168.250.15:6807/6882 192.168.250.15:6808/6882 exists,up
> d458ca35-ec55-47a9-a7ce-47b9ddf4d889
> osd.2 up   in  weight 1 up_from 157 up_thru 158 down_at 135
> last_clean_interval [48,134) 192.168.250.8:6800/3564 192.168.250.8:6801/3564
> 192.168.250.8:6802/3564 exists,up c4ee9f05-bd5f-4536-8cb8-0af82c00d3d6
> osd.3 up   in  weight 1 up_from 143 up_thru 157 down_at 141
> last_clean_interval [53,141) 192.168.250.16:6802/14618
> 192.168.250.16:6804/14618 192.168.250.16:6805/14618 exists,up
> e9d67b85-97d1-4635-95c8-f7c50cd7f6b1
>
> pg_temp 0.50 [3,1]
> pg_temp 0.8b [2,1]
> pg_temp 1.4f [3,1]
> pg_temp 1.8a [2,1]
> pg_temp 2.4e [3,1]
> pg_temp 2.89 [2,1]
> --
> root@ceph-admin:/etc/ceph# crushtool -d /tmp/crushmap
> # begin crush map
>
> # devices
> device 0 osd.0
> device 1 osd.1
> device 2 osd.2
> device 3 osd.3
>
> # types
> type 0 osd
> type 1 host
> type 2 rack
> type 3 row
> type 4 room
> type 5 datacenter
> type 6 root
>
> # buckets
> host ceph-admin {
> id -2   # do not change unnecessarily
> # weight 2.000
> alg straw
> hash 0  # rjenkins1
> item osd.0 weight 1.000
> item osd.1 weight 1.000
> }
> rack unknownrack {
> id -3   # do not change unnecessarily
> # weight 2.000
> alg straw
> hash 0  # rjenkins1
> item ceph-admin weight 2.000
> }
> host ceph-node02 {
> id -4   # do not change unnecessarily
> # weight 1.000
> alg straw
> hash 0  # rjenkins1
> item osd.2 weight 1.000
> }
> host ceph-node01 {
> id -5   # do not change unnecessarily
> # weight 1.000
> alg straw
> hash 0  # rjenkins1
> item osd.3 weight 1.000
> }
> root default {
> id -1   # do not change unnecessarily
>