Hi!

I just found the time to check again: even after removing the broken
OSD, the mgr still crashes.
All OSDs are up and in.
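
For reference, the broken osd.32 was taken out along the lines of the
usual sequence (a rough sketch from memory, so treat the exact commands
as illustrative):

  ceph osd out osd.32
  systemctl stop ceph-osd@32     # on node1001, where osd.32 lived
  ceph osd crush remove osd.32
  ceph auth del osd.32
  ceph osd rm osd.32
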
If I run "ceph balancer on" on a HEALTH_OK cluster, an optimization
plan is generated and started. After a few minutes, all MGRs die.
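
To narrow this down, I can also trigger the same thing with a manual
plan and watch the active mgr's log for the traceback. Roughly like
this (a sketch; I am assuming upmap mode here, and the plan name
"myplan" is arbitrary):

  ceph balancer off
  ceph balancer mode upmap        # assumption: use whichever mode is actually configured
  ceph balancer eval              # score of the current distribution
  ceph balancer optimize myplan
  ceph balancer show myplan
  ceph balancer execute myplan

  # on the host running the active mgr, watch for the crash traceback
  tail -f /var/log/ceph/ceph-mgr.$(hostname -s).log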

This is a major problem for me, as I still have that SSD OSD that is
imbalanced and is limiting the whole pool's available space.
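
As a stopgap until the balancer works again, I am considering nudging
data off the fullest SSD OSD (osd.15, ~78% used below) by hand, roughly
like this (sketch only; the 0.85 value is a guess I have not tested):

  # lower the reweight of the fullest SSD OSD so some PGs move elsewhere
  ceph osd reweight 15 0.85

  # or let Ceph pick overloaded OSDs itself; dry-run first, then apply
  ceph osd test-reweight-by-utilization 120
  ceph osd reweight-by-utilization 120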


root@adminnode:~# ceph osd tree
ID  CLASS WEIGHT   TYPE NAME                     STATUS REWEIGHT PRI-AFF
 -1       29.91933 root default
-16       29.91933     datacenter dc01
-19       29.91933         pod dc01-agg01
-10       16.52396             rack dc01-rack02
 -4        6.29695                 host node1001
  0   hdd  0.90999                     osd.0         up  1.00000 1.00000
  1   hdd  0.90999                     osd.1         up  1.00000 1.00000
  5   hdd  0.90999                     osd.5         up  1.00000 1.00000
 29   hdd  0.90970                     osd.29        up  1.00000 1.00000
 33   hdd  0.90970                     osd.33        up  1.00000 1.00000
  2   ssd  0.43700                     osd.2         up  1.00000 1.00000
  3   ssd  0.43700                     osd.3         up  1.00000 1.00000
  4   ssd  0.43700                     osd.4         up  1.00000 1.00000
 30   ssd  0.43660                     osd.30        up  1.00000 1.00000
 -7        6.29724                 host node1002
  9   hdd  0.90999                     osd.9         up  1.00000 1.00000
 10   hdd  0.90999                     osd.10        up  1.00000 1.00000
 11   hdd  0.90999                     osd.11        up  1.00000 1.00000
 12   hdd  0.90999                     osd.12        up  1.00000 1.00000
 35   hdd  0.90970                     osd.35        up  1.00000 1.00000
  6   ssd  0.43700                     osd.6         up  1.00000 1.00000
  7   ssd  0.43700                     osd.7         up  1.00000 1.00000
  8   ssd  0.43700                     osd.8         up  1.00000 1.00000
 31   ssd  0.43660                     osd.31        up  1.00000 1.00000
-28        2.18318                 host node1005
 34   ssd  0.43660                     osd.34        up  1.00000 1.00000
 36   ssd  0.87329                     osd.36        up  1.00000 1.00000
 37   ssd  0.87329                     osd.37        up  1.00000 1.00000
-29        1.74658                 host node1006
 42   ssd  0.87329                     osd.42        up  1.00000 1.00000
 43   ssd  0.87329                     osd.43        up  1.00000 1.00000
-11       13.39537             rack dc01-rack03
-22        5.38794                 host node1003
 17   hdd  0.90999                     osd.17        up  1.00000 1.00000
 18   hdd  0.90999                     osd.18        up  1.00000 1.00000
 24   hdd  0.90999                     osd.24        up  1.00000 1.00000
 26   hdd  0.90999                     osd.26        up  1.00000 1.00000
 13   ssd  0.43700                     osd.13        up  1.00000 1.00000
 14   ssd  0.43700                     osd.14        up  1.00000 1.00000
 15   ssd  0.43700                     osd.15        up  1.00000 1.00000
 16   ssd  0.43700                     osd.16        up  1.00000 1.00000
-25        5.38765                 host node1004
 23   hdd  0.90999                     osd.23        up  1.00000 1.00000
 25   hdd  0.90999                     osd.25        up  1.00000 1.00000
 27   hdd  0.90999                     osd.27        up  1.00000 1.00000
 28   hdd  0.90970                     osd.28        up  1.00000 1.00000
 19   ssd  0.43700                     osd.19        up  1.00000 1.00000
 20   ssd  0.43700                     osd.20        up  1.00000 1.00000
 21   ssd  0.43700                     osd.21        up  1.00000 1.00000
 22   ssd  0.43700                     osd.22        up  1.00000 1.00000
-30        2.61978                 host node1007
 38   ssd  0.43660                     osd.38        up  1.00000 1.00000
 39   ssd  0.43660                     osd.39        up  1.00000 1.00000
 40   ssd  0.87329                     osd.40        up  1.00000 1.00000
 41   ssd  0.87329                     osd.41        up  1.00000 1.00000



root@adminnode:~# ceph osd df
ID CLASS WEIGHT  REWEIGHT SIZE    USE     AVAIL   %USE  VAR  PGS
 0   hdd 0.90999  1.00000  932GiB  353GiB  579GiB 37.87 0.83  95
 1   hdd 0.90999  1.00000  932GiB  400GiB  531GiB 42.98 0.94 108
 5   hdd 0.90999  1.00000  932GiB  267GiB  664GiB 28.70 0.63  72
29   hdd 0.90970  1.00000  932GiB  356GiB  576GiB 38.19 0.84  96
33   hdd 0.90970  1.00000  932GiB  344GiB  587GiB 36.94 0.81  93
 2   ssd 0.43700  1.00000  447GiB  273GiB  174GiB 61.09 1.34  52
 3   ssd 0.43700  1.00000  447GiB  252GiB  195GiB 56.38 1.23  61
 4   ssd 0.43700  1.00000  447GiB  308GiB  140GiB 68.78 1.51  59
30   ssd 0.43660  1.00000  447GiB  231GiB  216GiB 51.77 1.13  48
 9   hdd 0.90999  1.00000  932GiB  358GiB  573GiB 38.48 0.84  97
10   hdd 0.90999  1.00000  932GiB  347GiB  585GiB 37.25 0.82  94
11   hdd 0.90999  1.00000  932GiB  335GiB  597GiB 35.96 0.79  91
12   hdd 0.90999  1.00000  932GiB  357GiB  575GiB 38.28 0.84  96
35   hdd 0.90970  1.00000  932GiB  318GiB  614GiB 34.14 0.75  86
 6   ssd 0.43700  1.00000  447GiB  278GiB  170GiB 62.08 1.36  63
 7   ssd 0.43700  1.00000  447GiB  256GiB  191GiB 57.17 1.25  60
 8   ssd 0.43700  1.00000  447GiB  291GiB  156GiB 65.01 1.42  57
31   ssd 0.43660  1.00000  447GiB  246GiB  201GiB 54.96 1.20  51
34   ssd 0.43660  1.00000  447GiB  189GiB  258GiB 42.22 0.92  46
36   ssd 0.87329  1.00000  894GiB  389GiB  506GiB 43.45 0.95  91
37   ssd 0.87329  1.00000  894GiB  390GiB  504GiB 43.63 0.96  85
42   ssd 0.87329  1.00000  894GiB  401GiB  493GiB 44.88 0.98  92
43   ssd 0.87329  1.00000  894GiB  455GiB  439GiB 50.89 1.11  89
17   hdd 0.90999  1.00000  932GiB  368GiB  563GiB 39.55 0.87 100
18   hdd 0.90999  1.00000  932GiB  350GiB  582GiB 37.56 0.82  95
24   hdd 0.90999  1.00000  932GiB  359GiB  572GiB 38.58 0.84  97
26   hdd 0.90999  1.00000  932GiB  388GiB  544GiB 41.62 0.91 105
13   ssd 0.43700  1.00000  447GiB  322GiB  125GiB 72.12 1.58  80
14   ssd 0.43700  1.00000  447GiB  291GiB  156GiB 65.16 1.43  70
15   ssd 0.43700  1.00000  447GiB  350GiB 96.9GiB 78.33 1.72  78 <--
16   ssd 0.43700  1.00000  447GiB  268GiB  179GiB 60.05 1.31  71
23   hdd 0.90999  1.00000  932GiB  364GiB  567GiB 39.08 0.86  98
25   hdd 0.90999  1.00000  932GiB  391GiB  541GiB 41.92 0.92 106
27   hdd 0.90999  1.00000  932GiB  393GiB  538GiB 42.21 0.92 106
28   hdd 0.90970  1.00000  932GiB  467GiB  464GiB 50.14 1.10 126
19   ssd 0.43700  1.00000  447GiB  310GiB  137GiB 69.36 1.52  76
20   ssd 0.43700  1.00000  447GiB  316GiB  131GiB 70.66 1.55  76
21   ssd 0.43700  1.00000  447GiB  323GiB  125GiB 72.13 1.58  80
22   ssd 0.43700  1.00000  447GiB  283GiB  164GiB 63.39 1.39  69
38   ssd 0.43660  1.00000  447GiB  146GiB  302GiB 32.55 0.71  46
39   ssd 0.43660  1.00000  447GiB  142GiB  305GiB 31.84 0.70  43
40   ssd 0.87329  1.00000  894GiB  407GiB  487GiB 45.53 1.00  98
41   ssd 0.87329  1.00000  894GiB  353GiB  541GiB 39.51 0.87 102
                    TOTAL 29.9TiB 13.7TiB 16.3TiB 45.66
MIN/MAX VAR: 0.63/1.72  STDDEV: 13.59




Kevin

On Sun, Jan 6, 2019 at 07:34, Konstantin Shalygin <k0...@k0ste.ru> wrote:
>
> On 1/5/19 4:17 PM, Kevin Olbrich wrote:
> > root@adminnode:~# ceph osd tree
> > ID  CLASS WEIGHT   TYPE NAME                     STATUS REWEIGHT PRI-AFF
> >   -1       30.82903 root default
> > -16       30.82903     datacenter dc01
> > -19       30.82903         pod dc01-agg01
> > -10       17.43365             rack dc01-rack02
> >   -4        7.20665                 host node1001
> >    0   hdd  0.90999                     osd.0         up  1.00000 1.00000
> >    1   hdd  0.90999                     osd.1         up  1.00000 1.00000
> >    5   hdd  0.90999                     osd.5         up  1.00000 1.00000
> >   29   hdd  0.90970                     osd.29        up  1.00000 1.00000
> >   32   hdd  0.90970                     osd.32      down        0 1.00000
> >   33   hdd  0.90970                     osd.33        up  1.00000 1.00000
> >    2   ssd  0.43700                     osd.2         up  1.00000 1.00000
> >    3   ssd  0.43700                     osd.3         up  1.00000 1.00000
> >    4   ssd  0.43700                     osd.4         up  1.00000 1.00000
> >   30   ssd  0.43660                     osd.30        up  1.00000 1.00000
> >   -7        6.29724                 host node1002
> >    9   hdd  0.90999                     osd.9         up  1.00000 1.00000
> >   10   hdd  0.90999                     osd.10        up  1.00000 1.00000
> >   11   hdd  0.90999                     osd.11        up  1.00000 1.00000
> >   12   hdd  0.90999                     osd.12        up  1.00000 1.00000
> >   35   hdd  0.90970                     osd.35        up  1.00000 1.00000
> >    6   ssd  0.43700                     osd.6         up  1.00000 1.00000
> >    7   ssd  0.43700                     osd.7         up  1.00000 1.00000
> >    8   ssd  0.43700                     osd.8         up  1.00000 1.00000
> >   31   ssd  0.43660                     osd.31        up  1.00000 1.00000
> > -28        2.18318                 host node1005
> >   34   ssd  0.43660                     osd.34        up  1.00000 1.00000
> >   36   ssd  0.87329                     osd.36        up  1.00000 1.00000
> >   37   ssd  0.87329                     osd.37        up  1.00000 1.00000
> > -29        1.74658                 host node1006
> >   42   ssd  0.87329                     osd.42        up  1.00000 1.00000
> >   43   ssd  0.87329                     osd.43        up  1.00000 1.00000
> > -11       13.39537             rack dc01-rack03
> > -22        5.38794                 host node1003
> >   17   hdd  0.90999                     osd.17        up  1.00000 1.00000
> >   18   hdd  0.90999                     osd.18        up  1.00000 1.00000
> >   24   hdd  0.90999                     osd.24        up  1.00000 1.00000
> >   26   hdd  0.90999                     osd.26        up  1.00000 1.00000
> >   13   ssd  0.43700                     osd.13        up  1.00000 1.00000
> >   14   ssd  0.43700                     osd.14        up  1.00000 1.00000
> >   15   ssd  0.43700                     osd.15        up  1.00000 1.00000
> >   16   ssd  0.43700                     osd.16        up  1.00000 1.00000
> > -25        5.38765                 host node1004
> >   23   hdd  0.90999                     osd.23        up  1.00000 1.00000
> >   25   hdd  0.90999                     osd.25        up  1.00000 1.00000
> >   27   hdd  0.90999                     osd.27        up  1.00000 1.00000
> >   28   hdd  0.90970                     osd.28        up  1.00000 1.00000
> >   19   ssd  0.43700                     osd.19        up  1.00000 1.00000
> >   20   ssd  0.43700                     osd.20        up  1.00000 1.00000
> >   21   ssd  0.43700                     osd.21        up  1.00000 1.00000
> >   22   ssd  0.43700                     osd.22        up  1.00000 1.00000
> > -30        2.61978                 host node1007
> >   38   ssd  0.43660                     osd.38        up  1.00000 1.00000
> >   39   ssd  0.43660                     osd.39        up  1.00000 1.00000
> >   40   ssd  0.87329                     osd.40        up  1.00000 1.00000
> >   41   ssd  0.87329                     osd.41        up  1.00000 1.00000
> >
> > ========================================================
> > root@adminnode:~# ceph osd df tree
> > ID  CLASS WEIGHT   REWEIGHT SIZE    USE     AVAIL   %USE  VAR  PGS TYPE NAME
> >   -1       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   - root default
> > -16       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   -     datacenter dc01
> > -19       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   -         pod dc01-agg01
> > -10       17.43365        - 16.5TiB 7.31TiB 9.21TiB 44.26 0.95   -             rack dc01-rack02
> >   -4        7.20665        - 6.29TiB 2.76TiB 3.54TiB 43.83 0.94   -                 host node1001
> >    0   hdd  0.90999  1.00000  932GiB  356GiB  575GiB 38.22 0.82  95                     osd.0
> >    1   hdd  0.90999  1.00000  932GiB  397GiB  534GiB 42.66 0.91 106                     osd.1
> >    5   hdd  0.90999  1.00000  932GiB  284GiB  647GiB 30.50 0.65  76                     osd.5
> >   29   hdd  0.90970  1.00000  932GiB  366GiB  566GiB 39.29 0.84  98                     osd.29
> >   32   hdd  0.90970        0      0B      0B      0B     0    0   0                     osd.32
> >   33   hdd  0.90970  1.00000  932GiB  369GiB  563GiB 39.57 0.85  99                     osd.33
> >    2   ssd  0.43700  1.00000  447GiB  271GiB  176GiB 60.67 1.30  50                     osd.2
> >    3   ssd  0.43700  1.00000  447GiB  249GiB  198GiB 55.62 1.19  58                     osd.3
> >    4   ssd  0.43700  1.00000  447GiB  297GiB  150GiB 66.39 1.42  56                     osd.4
> >   30   ssd  0.43660  1.00000  447GiB  236GiB  211GiB 52.85 1.13  48                     osd.30
> >   -7        6.29724        - 6.29TiB 2.74TiB 3.55TiB 43.53 0.93   -                 host node1002
> >    9   hdd  0.90999  1.00000  932GiB  354GiB  578GiB 37.96 0.81  95                     osd.9
> >   10   hdd  0.90999  1.00000  932GiB  357GiB  575GiB 38.28 0.82  96                     osd.10
> >   11   hdd  0.90999  1.00000  932GiB  318GiB  613GiB 34.18 0.73  86                     osd.11
> >   12   hdd  0.90999  1.00000  932GiB  373GiB  558GiB 40.09 0.86 100                     osd.12
> >   35   hdd  0.90970  1.00000  932GiB  343GiB  588GiB 36.83 0.79  92                     osd.35
> >    6   ssd  0.43700  1.00000  447GiB  269GiB  178GiB 60.20 1.29  60                     osd.6
> >    7   ssd  0.43700  1.00000  447GiB  249GiB  198GiB 55.69 1.19  56                     osd.7
> >    8   ssd  0.43700  1.00000  447GiB  286GiB  161GiB 63.95 1.37  56                     osd.8
> >   31   ssd  0.43660  1.00000  447GiB  257GiB  190GiB 57.47 1.23  55                     osd.31
> > -28        2.18318        - 2.18TiB  968GiB 1.24TiB 43.29 0.93   -                 host node1005
> >   34   ssd  0.43660  1.00000  447GiB  202GiB  245GiB 45.14 0.97  47                     osd.34
> >   36   ssd  0.87329  1.00000  894GiB  405GiB  489GiB 45.28 0.97  91                     osd.36
> >   37   ssd  0.87329  1.00000  894GiB  361GiB  533GiB 40.38 0.87  79                     osd.37
> > -29        1.74658        - 1.75TiB  888GiB  900GiB 49.65 1.06   -                 host node1006
> >   42   ssd  0.87329  1.00000  894GiB  417GiB  477GiB 46.68 1.00  92                     osd.42
> >   43   ssd  0.87329  1.00000  894GiB  471GiB  424GiB 52.63 1.13  90                     osd.43
> > -11       13.39537        - 13.4TiB 6.64TiB 6.75TiB 49.60 1.06   -             rack dc01-rack03
> > -22        5.38794        - 5.39TiB 2.70TiB 2.69TiB 50.14 1.07   -                 host node1003
> >   17   hdd  0.90999  1.00000  932GiB  371GiB  560GiB 39.83 0.85 100                     osd.17
> >   18   hdd  0.90999  1.00000  932GiB  390GiB  542GiB 41.82 0.90 105                     osd.18
> >   24   hdd  0.90999  1.00000  932GiB  352GiB  580GiB 37.77 0.81  94                     osd.24
> >   26   hdd  0.90999  1.00000  932GiB  387GiB  545GiB 41.54 0.89 104                     osd.26
> >   13   ssd  0.43700  1.00000  447GiB  319GiB  128GiB 71.32 1.53  77                     osd.13
> >   14   ssd  0.43700  1.00000  447GiB  303GiB  144GiB 67.76 1.45  70                     osd.14
> >   15   ssd  0.43700  1.00000  447GiB  361GiB 86.4GiB 80.67 1.73  77                     osd.15
> >   16   ssd  0.43700  1.00000  447GiB  283GiB  164GiB 63.29 1.36  71                     osd.16
> > -25        5.38765        - 5.39TiB 2.83TiB 2.56TiB 52.55 1.13   -                 host node1004
> >   23   hdd  0.90999  1.00000  932GiB  382GiB  549GiB 41.05 0.88 102                     osd.23
> >   25   hdd  0.90999  1.00000  932GiB  412GiB  520GiB 44.20 0.95 111                     osd.25
> >   27   hdd  0.90999  1.00000  932GiB  385GiB  546GiB 41.36 0.89 103                     osd.27
> >   28   hdd  0.90970  1.00000  932GiB  462GiB  469GiB 49.64 1.06 124                     osd.28
> >   19   ssd  0.43700  1.00000  447GiB  314GiB  133GiB 70.22 1.51  75                     osd.19
> >   20   ssd  0.43700  1.00000  447GiB  327GiB  120GiB 73.06 1.57  76                     osd.20
> >   21   ssd  0.43700  1.00000  447GiB  324GiB  123GiB 72.45 1.55  77                     osd.21
> >   22   ssd  0.43700  1.00000  447GiB  292GiB  156GiB 65.21 1.40  68                     osd.22
> > -30        2.61978        - 2.62TiB 1.11TiB 1.51TiB 42.43 0.91   -                 host node1007
> >   38   ssd  0.43660  1.00000  447GiB  165GiB  283GiB 36.82 0.79  46                     osd.38
> >   39   ssd  0.43660  1.00000  447GiB  156GiB  292GiB 34.79 0.75  42                     osd.39
> >   40   ssd  0.87329  1.00000  894GiB  429GiB  466GiB 47.94 1.03  98                     osd.40
> >   41   ssd  0.87329  1.00000  894GiB  389GiB  505GiB 43.55 0.93 103                     osd.41
> >                        TOTAL 29.9TiB 14.0TiB 16.0TiB 46.65
> > MIN/MAX VAR: 0.65/1.73  STDDEV: 13.30
> >
> > =============================================================
> > root@adminnode:~# ceph df && ceph -v
> > GLOBAL:
> >      SIZE        AVAIL       RAW USED     %RAW USED
> >      29.9TiB     16.0TiB      14.0TiB         46.65
> > POOLS:
> >      NAME                  ID     USED        %USED     MAX AVAIL     OBJECTS
> >      rbd_vms_ssd           2       986GiB     49.83        993GiB      262606
> >      rbd_vms_hdd           3      3.76TiB     48.94       3.92TiB      992255
> >      rbd_vms_ssd_01        4       372KiB         0        662GiB         148
> >      rbd_vms_ssd_01_ec     6      2.85TiB     68.81       1.29TiB      770506
> >
> > ceph version 12.2.8 (ae699615bac534ea496ee965ac6192cb7e0e07c0) luminous (stable)
>
> Looks good. You should always delete your down OSDs from the CRUSH map
> before replacing them. After deleting this OSD, try the balancer again.
>
>
>
> k
>
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
