On Fri, Jul 28, 2017 at 05:52:29PM +0800, linghucongsong wrote:
> 
> You have two crush rules? One is ssd, the other is hdd?

yes, exactly..
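(for anyone who wants to cross-check the two rules themselves, something along these lines should do it -- just a sketch, the /tmp paths are arbitrary examples:

    ceph osd crush rule ls                                 # lists the rule names (replicated_ruleset, ssd)
    ceph osd crush rule dump ssd                           # shows the steps of the ssd rule
    ceph osd getcrushmap -o /tmp/crushmap.bin              # grab the binary crush map
    crushtool -d /tmp/crushmap.bin -o /tmp/crushmap.txt    # decompile it to readable text

the dump below is what the cluster currently reports.)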
> 
> Can you show ceph osd dump|grep pool

pool 3 'vm' replicated size 3 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1024 pgp_num 1024 last_change 69955 flags hashpspool min_read_recency_for_promote 1 min_write_recency_for_promote 1 stripe_width 0
pool 4 'cephfs_data' replicated size 3 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1024 pgp_num 1024 last_change 74682 flags hashpspool crash_replay_interval 45 min_write_recency_for_promote 1 stripe_width 0
pool 5 'cephfs_metadata' replicated size 3 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1024 pgp_num 1024 last_change 74667 flags hashpspool min_write_recency_for_promote 1 stripe_width 0
pool 11 'ssd' replicated size 3 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 128 pgp_num 128 last_change 46119 flags hashpspool min_write_recency_for_promote 1 stripe_width 0

> ceph osd crush dump

{
    "devices": [
        { "id": 0, "name": "osd.0" },
        { "id": 1, "name": "osd.1" },
        { "id": 2, "name": "osd.2" },
        { "id": 3, "name": "osd.3" },
        { "id": 4, "name": "osd.4" },
        { "id": 5, "name": "osd.5" },
        { "id": 6, "name": "osd.6" },
        { "id": 7, "name": "device7" },
        { "id": 8, "name": "osd.8" },
        { "id": 9, "name": "osd.9" },
        { "id": 10, "name": "osd.10" },
        { "id": 11, "name": "osd.11" },
        { "id": 12, "name": "osd.12" },
        { "id": 13, "name": "osd.13" },
        { "id": 14, "name": "osd.14" },
        { "id": 15, "name": "osd.15" },
        { "id": 16, "name": "osd.16" },
        { "id": 17, "name": "osd.17" },
        { "id": 18, "name": "osd.18" },
        { "id": 19, "name": "osd.19" },
        { "id": 20, "name": "osd.20" },
        { "id": 21, "name": "osd.21" },
        { "id": 22, "name": "osd.22" },
        { "id": 23, "name": "osd.23" },
        { "id": 24, "name": "osd.24" },
        { "id": 25, "name": "osd.25" },
        { "id": 26, "name": "osd.26" }
    ],
    "types": [
        { "type_id": 0, "name": "osd" },
        { "type_id": 1, "name": "host" },
        { "type_id": 2, "name": "chassis" },
        { "type_id": 3, "name": "rack" },
        { "type_id": 4, "name": "row" },
        { "type_id": 5, "name": "pdu" },
        { "type_id": 6, "name": "pod" },
        { "type_id": 7, "name": "room" },
        { "type_id": 8, "name": "datacenter" },
        { "type_id": 9, "name": "region" },
        { "type_id": 10, "name": "root" }
    ],
    "buckets": [
        { "id": -1, "name": "default", "type_id": 10, "type_name": "root", "weight": 2575553, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": -4, "weight": 779875, "pos": 0 },
              { "id": -5, "weight": 681571, "pos": 1 },
              { "id": -6, "weight": 511178, "pos": 2 },
              { "id": -3, "weight": 602929, "pos": 3 } ] },
        { "id": -2, "name": "ssd", "type_id": 10, "type_name": "root", "weight": 102233, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": -9, "weight": 26214, "pos": 0 },
              { "id": -10, "weight": 39320, "pos": 1 },
              { "id": -11, "weight": 22282, "pos": 2 },
              { "id": -7, "weight": 14417, "pos": 3 } ] },
        { "id": -3, "name": "v1d-sata", "type_id": 1, "type_name": "host", "weight": 602929, "alg": "straw", "hash": "rjenkins1",
          "items": [
              { "id": 12, "weight": 91750, "pos": 0 },
              { "id": 20, "weight": 91750, "pos": 1 },
              { "id": 21, "weight": 235929, "pos": 2 },
              { "id": 22, "weight": 91750, "pos": 3 },
              { "id": 23, "weight": 91750, "pos": 4 } ] },
        { "id": -4, "name": "v1a", "type_id": 1, "type_name": "host", "weight": 779875, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 6, "weight": 104857, "pos": 0 },
              { "id": 8, "weight": 117964, "pos": 1 },
              { "id": 2, "weight": 104857, "pos": 2 },
              { "id": 0, "weight": 111411, "pos": 3 },
              { "id": 4, "weight": 104857, "pos": 4 },
              { "id": 25, "weight": 235929, "pos": 5 } ] },
        { "id": -5, "name": "v1b", "type_id": 1, "type_name": "host", "weight": 681571, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 1, "weight": 104857, "pos": 0 },
              { "id": 3, "weight": 117964, "pos": 1 },
              { "id": 9, "weight": 104857, "pos": 2 },
              { "id": 11, "weight": 117964, "pos": 3 },
              { "id": 24, "weight": 235929, "pos": 4 } ] },
        { "id": -6, "name": "v1c", "type_id": 1, "type_name": "host", "weight": 511178, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 14, "weight": 104857, "pos": 0 },
              { "id": 15, "weight": 117964, "pos": 1 },
              { "id": 16, "weight": 91750, "pos": 2 },
              { "id": 18, "weight": 91750, "pos": 3 },
              { "id": 17, "weight": 104857, "pos": 4 } ] },
        { "id": -7, "name": "v1d-ssd", "type_id": 1, "type_name": "host", "weight": 14417, "alg": "straw", "hash": "rjenkins1",
          "items": [
              { "id": 19, "weight": 14417, "pos": 0 } ] },
        { "id": -9, "name": "v1c-ssd", "type_id": 1, "type_name": "host", "weight": 26214, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 10, "weight": 26214, "pos": 0 } ] },
        { "id": -10, "name": "v1a-ssd", "type_id": 1, "type_name": "host", "weight": 39320, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 5, "weight": 19660, "pos": 0 },
              { "id": 26, "weight": 19660, "pos": 1 } ] },
        { "id": -11, "name": "v1b-ssd", "type_id": 1, "type_name": "host", "weight": 22282, "alg": "straw2", "hash": "rjenkins1",
          "items": [
              { "id": 13, "weight": 22282, "pos": 0 } ] }
    ],
    "rules": [
        { "rule_id": 0, "rule_name": "replicated_ruleset", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [
              { "op": "take", "item": -1, "item_name": "default" },
              { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
              { "op": "emit" } ] },
        { "rule_id": 1, "rule_name": "ssd", "ruleset": 1, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [
              { "op": "take", "item": -2, "item_name": "ssd" },
              { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
              { "op": "emit" } ] }
    ],
    "tunables": {
        "choose_local_tries": 0,
        "choose_local_fallback_tries": 0,
        "choose_total_tries": 50,
        "chooseleaf_descend_once": 1,
        "chooseleaf_vary_r": 1,
        "chooseleaf_stable": 0,
        "straw_calc_version": 1,
        "allowed_bucket_algs": 54,
        "profile": "hammer",
        "optimal_tunables": 0,
        "legacy_tunables": 0,
        "minimum_required_version": "hammer",
        "require_feature_tunables": 1,
        "require_feature_tunables2": 1,
        "has_v2_rules": 0,
        "require_feature_tunables3": 1,
        "has_v3_rules": 0,
        "has_v4_buckets": 1,
        "require_feature_tunables5": 0,
        "has_v5_rules": 0
    }
}

> 
> At 2017-07-28 17:47:48, "Nikola Ciprich" <nikola.cipr...@linuxbox.cz> wrote:
> 
> >On Fri, Jul 28, 2017 at 05:43:14PM +0800, linghucongsong wrote:
> >> 
> >> It looks like the OSDs in your cluster are not all the same size.
> >> 
> >> Can you show ceph osd df output?
> >
> >you're right, they're not..
> >here's the output:
> >
> >[root@v1b ~]# ceph osd df tree
> >ID  WEIGHT   REWEIGHT SIZE   USE   AVAIL  %USE  VAR  PGS TYPE NAME
> > -2  1.55995        -  1706G  883G   805G 51.78 2.55   0 root ssd
> > -9  0.39999        -   393G  221G   171G 56.30 2.78   0 host v1c-ssd
> > 10  0.39999  1.00000   393G  221G   171G 56.30 2.78  98 osd.10
> >-10  0.59998        -   683G  275G   389G 40.39 1.99   0 host v1a-ssd
> >  5  0.29999  1.00000   338G  151G   187G 44.77 2.21  65 osd.5
> > 26  0.29999  1.00000   344G  124G   202G 36.07 1.78  52 osd.26
> >-11  0.34000        -   338G  219G   119G 64.68 3.19   0 host v1b-ssd
> > 13  0.34000  1.00000   338G  219G   119G 64.68 3.19  96 osd.13
> > -7  0.21999        -   290G  166G   123G 57.43 2.83   0 host v1d-ssd
> > 19  0.21999  1.00000   290G  166G   123G 57.43 2.83  73 osd.19
> > -1 39.29982        - 43658G 8312G 34787G 19.04 0.94   0 root default
> > -4 11.89995        - 12806G 2422G 10197G 18.92 0.93   0 host v1a
> >  6  1.59999  1.00000  1833G  358G  1475G 19.53 0.96 366 osd.6
> >  8  1.79999  1.00000  1833G  313G  1519G 17.11 0.84 370 osd.8
> >  2  1.59999  1.00000  1833G  320G  1513G 17.46 0.86 331 osd.2
> >  0  1.70000  1.00000  1804G  431G  1373G 23.90 1.18 359 osd.0
> >  4  1.59999  1.00000  1833G  294G  1539G 16.07 0.79 360 osd.4
> > 25  3.59999  1.00000  3667G  704G  2776G 19.22 0.95 745 osd.25
> > -5 10.39995        - 10914G 2154G  8573G 19.74 0.97   0 host v1b
> >  1  1.59999  1.00000  1804G  350G  1454G 19.42 0.96 409 osd.1
> >  3  1.79999  1.00000  1804G  360G  1444G 19.98 0.99 412 osd.3
> >  9  1.59999  1.00000  1804G  331G  1473G 18.37 0.91 363 osd.9
> > 11  1.79999  1.00000  1833G  367G  1465G 20.06 0.99 415 osd.11
> > 24  3.59999  1.00000  3667G  744G  2736G 20.30 1.00 834 osd.24
> > -6  7.79996        -  9051G 1769G  7282G 19.54 0.96   0 host v1c
> > 14  1.59999  1.00000  1804G  370G  1433G 20.54 1.01 442 osd.14
> > 15  1.79999  1.00000  1833G  383G  1450G 20.92 1.03 447 osd.15
> > 16  1.39999  1.00000  1804G  295G  1508G 16.38 0.81 355 osd.16
> > 18  1.39999  1.00000  1804G  366G  1438G 20.29 1.00 381 osd.18
> > 17  1.59999  1.00000  1804G  353G  1451G 19.57 0.97 429 osd.17
> > -3  9.19997        - 10885G 1965G  8733G 18.06 0.89   0 host v1d-sata
> > 12  1.39999  1.00000  1804G  348G  1455G 19.32 0.95 365 osd.12
> > 20  1.39999  1.00000  1804G  335G  1468G 18.60 0.92 371 osd.20
> > 21  3.59999  1.00000  3667G  695G  2785G 18.97 0.94 871 osd.21
> > 22  1.39999  1.00000  1804G  281G  1522G 15.63 0.77 326 osd.22
> > 23  1.39999  1.00000  1804G  303G  1500G 16.83 0.83 321 osd.23
> >             TOTAL   45365G 9195G 35592G 20.27
> >MIN/MAX VAR: 0.77/3.19  STDDEV: 14.69
> >
> >apart from replacing OSDs, how can I help it?
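(for reference, the usual way to even things out without swapping disks is reweighting -- only a sketch, the threshold and osd id are example values, and the dry-run variant should be tried first:

    ceph osd test-reweight-by-utilization 110   # dry run: show what would be changed
    ceph osd reweight-by-utilization 110        # lower the reweight of OSDs above 110% of average utilisation

or nudging a single over-full OSD by hand, e.g. osd.13, the fullest ssd in the listing above:

    ceph osd reweight 13 0.9

this touches only the override reweight, not the crush weight, so it can be reverted with "ceph osd reweight 13 1.0".)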
> >
> >> 
> >> At 2017-07-28 17:24:29, "Nikola Ciprich" <nikola.cipr...@linuxbox.cz> wrote:
> >> 
> >> >I forgot to add that the OSD daemons really seem to be idle, no disk
> >> >activity, no CPU usage.. it just looks to me like some kind of
> >> >deadlock, as if they were waiting for each other..
> >> >
> >> >and so I've been trying to get the last 1.5% of misplaced / degraded PGs
> >> >recovered for almost a week..
> >> >
> >> >On Fri, Jul 28, 2017 at 10:56:02AM +0200, Nikola Ciprich wrote:
> >> >> Hi,
> >> >>
> >> >> I'm trying to find the reason for strange recovery issues I'm seeing on
> >> >> our cluster..
> >> >>
> >> >> it's a mostly idle, 4 node cluster with 26 OSDs evenly distributed
> >> >> across nodes, jewel 10.2.9.
> >> >>
> >> >> the problem is that after some disk replacements and data moves, recovery
> >> >> is progressing extremely slowly.. pgs seem to be stuck in
> >> >> active+recovering+degraded state:
> >> >>
> >> >> [root@v1d ~]# ceph -s
> >> >>     cluster a5efbc87-3900-4c42-a977-8c93f7aa8c33
> >> >>      health HEALTH_WARN
> >> >>             159 pgs backfill_wait
> >> >>             4 pgs backfilling
> >> >>             259 pgs degraded
> >> >>             12 pgs recovering
> >> >>             113 pgs recovery_wait
> >> >>             215 pgs stuck degraded
> >> >>             266 pgs stuck unclean
> >> >>             140 pgs stuck undersized
> >> >>             151 pgs undersized
> >> >>             recovery 37788/2327775 objects degraded (1.623%)
> >> >>             recovery 23854/2327775 objects misplaced (1.025%)
> >> >>             noout,noin flag(s) set
> >> >>      monmap e21: 3 mons at {v1a=10.0.0.1:6789/0,v1b=10.0.0.2:6789/0,v1c=10.0.0.3:6789/0}
> >> >>             election epoch 6160, quorum 0,1,2 v1a,v1b,v1c
> >> >>       fsmap e817: 1/1/1 up {0=v1a=up:active}, 1 up:standby
> >> >>      osdmap e76002: 26 osds: 26 up, 26 in; 185 remapped pgs
> >> >>             flags noout,noin,sortbitwise,require_jewel_osds
> >> >>       pgmap v80995844: 3200 pgs, 4 pools, 2876 GB data, 757 kobjects
> >> >>             9215 GB used, 35572 GB / 45365 GB avail
> >> >>             37788/2327775 objects degraded (1.623%)
> >> >>             23854/2327775 objects misplaced (1.025%)
> >> >>                 2912 active+clean
> >> >>                  130 active+undersized+degraded+remapped+wait_backfill
> >> >>                   97 active+recovery_wait+degraded
> >> >>                   29 active+remapped+wait_backfill
> >> >>                   12 active+recovery_wait+undersized+degraded+remapped
> >> >>                    6 active+recovering+degraded
> >> >>                    5 active+recovering+undersized+degraded+remapped
> >> >>                    4 active+undersized+degraded+remapped+backfilling
> >> >>                    4 active+recovery_wait+degraded+remapped
> >> >>                    1 active+recovering+degraded+remapped
> >> >>   client io 2026 B/s rd, 146 kB/s wr, 9 op/s rd, 21 op/s wr
> >> >>
> >> >> when I restart the affected OSDs, it bumps the recovery, but then other
> >> >> PGs get stuck.. All OSDs have been restarted multiple times, none is even
> >> >> close to nearfull, I just can't find what I'm doing wrong..
> >> >>
> >> >> possibly related OSD options:
> >> >>
> >> >> osd max backfills = 4
> >> >> osd recovery max active = 15
> >> >> debug osd = 0/0
> >> >> osd op threads = 4
> >> >> osd backfill scan min = 4
> >> >> osd backfill scan max = 16
> >> >>
> >> >> Any hints would be greatly appreciated
> >> >>
> >> >> thanks
> >> >>
> >> >> nik
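(again just for reference, the usual knobs for this kind of stuck recovery look roughly like the following -- a sketch only, the pg id is made up and the numbers are examples:

    ceph pg dump_stuck unclean        # list the stuck PGs
    ceph pg 3.1f query                # ask one of them what it is blocked on (use a real id from dump_stuck)

    # temporarily raise backfill/recovery limits on all OSDs at runtime
    ceph tell osd.* injectargs '--osd-max-backfills 8 --osd-recovery-max-active 20'

injectargs changes are not persistent across OSD restarts.)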
-- 
-------------------------------------
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28.rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:    +420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-------------------------------------
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com