Re: [ceph-users] pgs stuck unclean since forever, current state active+remapped

2013-08-15 Thread Gregory Farnum
They're unclean because CRUSH isn't generating an acting set of
sufficient size so the OSDs/monitors are keeping them remapped in
order to maintain replication guarantees. Look in the docs for the
crush tunables options for a discussion on this.
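As a rough offline check (a sketch only -- it assumes the compiled map that was
saved as /tmp/crushmap further down in the thread), crushtool can replay a rule
and report any inputs for which it cannot return the requested number of OSDs:

    crushtool -i /tmp/crushmap --test --rule 0 --num-rep 2 --show-bad-mappings

Any "bad mapping" lines it prints are placements where the rule came back with
fewer than two OSDs, which matches the condition described above.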
-Greg
Software Engineer #42 @ http://inktank.com | http://ceph.com


On Mon, Aug 12, 2013 at 7:16 PM, 不坏阿峰 onlydeb...@gmail.com wrote:
 I have PGs that have been stuck for a long time and don't know how to fix
 them. Can someone help me check?

 Environment: Debian 7 + ceph 0.61.7

 

Re: [ceph-users] pgs stuck unclean since forever, current state active+remapped

2013-08-15 Thread 不坏阿峰
Many thanks. I did that and it resolved the issue:

#ceph osd getcrushmap -o /tmp/crush
#crushtool -i /tmp/crush --enable-unsafe-tunables \
    --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 \
    --set-choose-total-tries 50 -o /tmp/crush.new
root@ceph-admin:/etc/ceph# ceph osd setcrushmap -i /tmp/crush.new

So far, health is OK.
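A quick sanity check for anyone repeating this (same paths and commands as
above; the decompile step just confirms the non-default tunables were actually
written into the new map):

    crushtool -d /tmp/crush.new | head    # "tunable ..." lines should now appear near the top of the map
    ceph -s                               # the six active+remapped PGs should return to active+clean
    ceph health detail                    # should report HEALTH_OK once peering settles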

2013/8/16 Gregory Farnum g...@inktank.com:
 They're unclean because CRUSH isn't generating an acting set of
 sufficient size so the OSDs/monitors are keeping them remapped in
 order to maintain replication guarantees. Look in the docs for the
 crush tunables options for a discussion on this.
 -Greg
 Software Engineer #42 @ http://inktank.com | http://ceph.com


 On Mon, Aug 12, 2013 at 7:16 PM, 不坏阿峰 onlydeb...@gmail.com wrote:
 I have PGs that have been stuck for a long time and don't know how to fix
 them. Can someone help me check?

 Environment: Debian 7 + ceph 0.61.7

 

[ceph-users] pgs stuck unclean since forever, current state active+remapped

2013-08-12 Thread 不坏阿峰
I have PGs that have been stuck for a long time and don't know how to fix
them. Can someone help me check?

Environment: Debian 7 + ceph 0.61.7


root@ceph-admin:~# ceph -s
   health HEALTH_WARN 6 pgs stuck unclean
   monmap e2: 2 mons at {a=192.168.250.15:6789/0,b=192.168.250.8:6789/0},
election epoch 8, quorum 0,1 a,b
   osdmap e159: 4 osds: 4 up, 4 in
pgmap v23487: 584 pgs: 578 active+clean, 6 active+remapped; 4513 MB
data, 12658 MB used, 387 GB / 399 GB avail; 426B/s wr, 0op/s
   mdsmap e114: 1/1/1 up {0=a=up:active}, 1 up:standby

--
root@ceph-admin:~# ceph health detail
HEALTH_WARN 6 pgs stuck unclean
pg 0.50 is stuck unclean since forever, current state active+remapped, last
acting [3,1]
pg 1.4f is stuck unclean since forever, current state active+remapped, last
acting [3,1]
pg 2.4e is stuck unclean since forever, current state active+remapped, last
acting [3,1]
pg 1.8a is stuck unclean since forever, current state active+remapped, last
acting [2,1]
pg 0.8b is stuck unclean since forever, current state active+remapped, last
acting [2,1]
pg 2.89 is stuck unclean since forever, current state active+remapped, last
acting [2,1]
--
root@ceph-admin:~# ceph osd tree

# id    weight  type name                       up/down reweight
-1      4       root default
-3      2               rack unknownrack
-2      2                       host ceph-admin
0       1                               osd.0   up      1
1       1                               osd.1   up      1
-4      1               host ceph-node02
2       1                       osd.2   down    1
-5      1               host ceph-node01
3       1                       osd.3   up      1
---
root@ceph-admin:~# ceph osd dump

epoch 159
fsid db32486a-7ad3-4afe-8b67-49ee2a6dcecf
created 2013-08-08 13:45:52.579015
modified 2013-08-12 05:18:37.895385
flags

pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
pg_num 192 pgp_num 192 last_change 1 owner 0 crash_replay_interval 45
pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash
rjenkins pg_num 192 pgp_num 192 last_change 1 owner 0
pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins
pg_num 192 pgp_num 192 last_change 1 owner 0
pool 3 'volumes' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins
pg_num 8 pgp_num 8 last_change 39 owner 18446744073709551615

max_osd 5
osd.0 up   in  weight 1 up_from 138 up_thru 157 down_at 137
last_clean_interval [45,135) 192.168.250.15:6803/5735
192.168.250.15:6804/5735 192.168.250.15:6805/5735 exists,up
99f2aec0-2367-4b68-86f2-58d6d41589c6
osd.1 up   in  weight 1 up_from 140 up_thru 157 down_at 137
last_clean_interval [47,136) 192.168.250.15:6806/6882
192.168.250.15:6807/6882 192.168.250.15:6808/6882 exists,up
d458ca35-ec55-47a9-a7ce-47b9ddf4d889
osd.2 up   in  weight 1 up_from 157 up_thru 158 down_at 135
last_clean_interval [48,134) 192.168.250.8:6800/3564 192.168.250.8:6801/3564
192.168.250.8:6802/3564 exists,up c4ee9f05-bd5f-4536-8cb8-0af82c00d3d6
osd.3 up   in  weight 1 up_from 143 up_thru 157 down_at 141
last_clean_interval [53,141) 192.168.250.16:6802/14618
192.168.250.16:6804/14618 192.168.250.16:6805/14618 exists,up
e9d67b85-97d1-4635-95c8-f7c50cd7f6b1

pg_temp 0.50 [3,1]
pg_temp 0.8b [2,1]
pg_temp 1.4f [3,1]
pg_temp 1.8a [2,1]
pg_temp 2.4e [3,1]
pg_temp 2.89 [2,1]
--
root@ceph-admin:/etc/ceph# crushtool -d /tmp/crushmap
# begin crush map

# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3

# types
type 0 osd
type 1 host
type 2 rack
type 3 row
type 4 room
type 5 datacenter
type 6 root

# buckets
host ceph-admin {
        id -2           # do not change unnecessarily
        # weight 2.000
        alg straw
        hash 0          # rjenkins1
        item osd.0 weight 1.000
        item osd.1 weight 1.000
}
rack unknownrack {
        id -3           # do not change unnecessarily
        # weight 2.000
        alg straw
        hash 0          # rjenkins1
        item ceph-admin weight 2.000
}
host ceph-node02 {
        id -4           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0          # rjenkins1
        item osd.2 weight 1.000
}
host ceph-node01 {
        id -5           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0          # rjenkins1
        item osd.3 weight 1.000
}
root default {
        id -1           # do not change unnecessarily
        # weight 4.000
        alg straw
        hash 0          # rjenkins1
        item unknownrack weight 2.000
        item ceph-node02 weight 1.000
        item ceph-node01 weight 1.000
}

# rules
rule data {
        ruleset 0
        type replicated
        min_size 1
        max_size 10
        step take default
        step choose firstn 0 type osd
        step emit
}
rule volumes {
        ruleset 3
        type replicated
        min_size 1
        max_size 10
        step take default
        step choose firstn 0 type osd
        step emit
}
rule metadata {
        ruleset 1
        type replicated
        min_size