We have a cluster of 10 servers hosting 64 OSDs and 5 mons. Each OSD is a 3TB
disk formatted with btrfs, and the servers run either Ubuntu 12.10 or 13.04.

Recently one of the servers (13.04) locked up (due to problems with btrfs -
something we have seen a few times). I decided not to try to recover the disks,
but to reformat them with XFS. I removed the OSDs, reformatted the disks, and
re-created the OSDs (they got the same OSD numbers).
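For reference, the remove/re-create sequence I used was roughly the standard one
(from memory, with osd.9 as the example id; the device path, CRUSH weight and
host bucket below stand in for the real values):

ceph osd out 9
stop ceph-osd id=9                       # Ubuntu upstart
ceph osd crush remove osd.9
ceph auth del osd.9
ceph osd rm 9

mkfs.xfs -f /dev/sdX                     # placeholder device
mount /dev/sdX /var/lib/ceph/osd/ceph-9
ceph osd create                          # hands back the lowest free id, hence the same number
ceph-osd -i 9 --mkfs --mkkey
ceph auth add osd.9 osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-9/keyring
ceph osd crush add osd.9 1.0 host=s2     # weight/host are examples; exact syntax varies a bit by release
start ceph-osd id=9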

I redid this twice (because I had partitioned the disks wrongly the first time)
and ended up with 2 unfound "pieces" in one pg:

root@s2:~# ceph health detail
HEALTH_WARN 1 pgs degraded; 1 pgs recovering; 1 pgs stuck unclean; recovery 
4448/28915270 degraded (0.015%); 2/9854766 unfound (0.000%)
pg 0.cfa is stuck unclean for 1004252.309704, current state 
active+recovering+degraded+remapped, last acting [23,50]
pg 0.cfa is active+recovering+degraded+remapped, acting [23,50], 2 unfound
recovery 4448/28915270 degraded (0.015%); 2/9854766 unfound (0.000%)
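
If I understand the tooling right, the two unfound objects can also be listed by
name with

ceph pg 0.cfa list_missing

but that only names them; it doesn't say where they could be recovered from.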


root@s2:~# ceph pg 0.cfa query

{ "state": "active+recovering+degraded+remapped",
  "epoch": 28197,
  "up": [
        23,
        50,
        18],
  "acting": [
        23,
        50],
  "info": { "pgid": "0.cfa",
      "last_update": "28082'7774",
      "last_complete": "23686'7083",
      "log_tail": "14360'4061",
      "last_backfill": "MAX",
      "purged_snaps": "[]",
      "history": { "epoch_created": 1,
          "last_epoch_started": 28197,
          "last_epoch_clean": 24810,
          "last_epoch_split": 0,
          "same_up_since": 28195,
          "same_interval_since": 28196,
          "same_primary_since": 26036,
          "last_scrub": "20585'6801",
          "last_scrub_stamp": "2013-07-28 15:40:53.298786",
          "last_deep_scrub": "20585'6801",
          "last_deep_scrub_stamp": "2013-07-28 15:40:53.298786",
          "last_clean_scrub_stamp": "2013-07-28 15:40:53.298786"},
      "stats": { "version": "28082'7774",
          "reported": "28197'41950",
          "state": "active+recovering+degraded+remapped",
          "last_fresh": "2013-08-13 14:34:33.057271",
          "last_change": "2013-08-13 14:34:33.057271",
          "last_active": "2013-08-13 14:34:33.057271",
          "last_clean": "2013-08-01 23:50:18.414082",
          "last_became_active": "2013-05-29 13:10:51.366237",
          "last_unstale": "2013-08-13 14:34:33.057271",
          "mapping_epoch": 28195,
          "log_start": "14360'4061",
          "ondisk_log_start": "14360'4061",
          "created": 1,
          "last_epoch_clean": 24810,
          "parent": "0.0",
          "parent_split_bits": 0,
          "last_scrub": "20585'6801",
          "last_scrub_stamp": "2013-07-28 15:40:53.298786",
          "last_deep_scrub": "20585'6801",
          "last_deep_scrub_stamp": "2013-07-28 15:40:53.298786",
          "last_clean_scrub_stamp": "2013-07-28 15:40:53.298786",
          "log_size": 0,
          "ondisk_log_size": 0,
          "stats_invalid": "0",
          "stat_sum": { "num_bytes": 145307402,
              "num_objects": 2234,
              "num_object_clones": 0,
              "num_object_copies": 0,
              "num_objects_missing_on_primary": 0,
              "num_objects_degraded": 0,
              "num_objects_unfound": 0,
              "num_read": 744,
              "num_read_kb": 410184,
              "num_write": 7774,
              "num_write_kb": 1155438,
              "num_scrub_errors": 0,
              "num_shallow_scrub_errors": 0,
              "num_deep_scrub_errors": 0,
              "num_objects_recovered": 3998,
              "num_bytes_recovered": 278803622,
              "num_keys_recovered": 0},
          "stat_cat_sum": {},
          "up": [
                23,
                50,
                18],
          "acting": [
                23,
                50]},
      "empty": 0,
      "dne": 0,
      "incomplete": 0,
      "last_epoch_started": 28197},
  "recovery_state": [
        { "name": "Started\/Primary\/Active",
          "enter_time": "2013-08-13 14:34:33.026698",
          "might_have_unfound": [
                { "osd": 9,
                  "status": "querying"},
                { "osd": 18,
                  "status": "querying"},
                { "osd": 50,
                  "status": "already probed"}],
          "recovery_progress": { "backfill_target": 50,
              "waiting_on_backfill": 0,
              "backfill_pos": "96220cfa\/10000799e82.00000000\/head\/\/0",
              "backfill_info": { "begin": "0\/\/0\/\/-1",
                  "end": "0\/\/0\/\/-1",
                  "objects": []},
              "peer_backfill_info": { "begin": "0\/\/0\/\/-1",
                  "end": "0\/\/0\/\/-1",
                  "objects": []},
              "backfills_in_flight": [],
              "pull_from_peer": [],
              "pushing": []},
          "scrub": { "scrubber.epoch_start": "0",
              "scrubber.active": 0,
              "scrubber.block_writes": 0,
              "scrubber.finalizing": 0,
              "scrubber.waiting_on": 0,
              "scrubber.waiting_on_whom": []}},
        { "name": "Started",
          "enter_time": "2013-08-13 14:34:32.024282"}]}

I have tried to mark those two pieces as lost, but ceph wouldn't let me, because
the pg is still in the "querying" state for osd 9 and 18. I have restarted
the OSDs, but I can't force any other status change.
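Concretely, what I tried was along these lines (exact invocations from memory;
the restarts go through upstart on these Ubuntu boxes):

ceph pg 0.cfa mark_unfound_lost revert   # refused while osd 9 and 18 are still "querying"
restart ceph-osd id=9                    # likewise for id=18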

What next? Take OSDs 9 and 18 out again and rebuild them?

thanks for your help
Jens-Christian


-- 
SWITCH
Jens-Christian Fischer, Peta Solutions
Werdstrasse 2, P.O. Box, 8021 Zurich, Switzerland
phone +41 44 268 15 15, direct +41 44 268 15 71
jens-christian.fisc...@switch.ch
http://www.switch.ch

http://www.switch.ch/socialmedia
