Re: still recovery issues with cuttlefish

2013-08-02 Thread Stefan Priebe

On 01.08.2013 23:23, Samuel Just wrote: Can you dump your osd settings?
 sudo ceph --admin-daemon ceph-osd.osdid.asok config show

Sure.



{ name: osd.0,
  cluster: ceph,
  none: 0\/5,
  lockdep: 0\/0,
  context: 0\/0,
  crush: 0\/0,
  mds: 0\/0,
  mds_balancer: 0\/0,
  mds_locker: 0\/0,
  mds_log: 0\/0,
  mds_log_expire: 0\/0,
  mds_migrator: 0\/0,
  buffer: 0\/0,
  timer: 0\/0,
  filer: 0\/0,
  striper: 0\/1,
  objecter: 0\/0,
  rados: 0\/0,
  rbd: 0\/0,
  journaler: 0\/0,
  objectcacher: 0\/0,
  client: 0\/0,
  osd: 0\/0,
  optracker: 0\/0,
  objclass: 0\/0,
  filestore: 0\/0,
  journal: 0\/0,
  ms: 0\/0,
  mon: 0\/0,
  monc: 0\/0,
  paxos: 0\/0,
  tp: 0\/0,
  auth: 0\/0,
  crypto: 1\/5,
  finisher: 0\/0,
  heartbeatmap: 0\/0,
  perfcounter: 0\/0,
  rgw: 0\/0,
  hadoop: 0\/0,
  javaclient: 1\/5,
  asok: 0\/0,
  throttle: 0\/0,
  host: cloud1-1268,
  fsid: ----,
  public_addr: 10.255.0.90:0\/0,
  cluster_addr: 10.255.0.90:0\/0,
  public_network: 10.255.0.1\/24,
  cluster_network: 10.255.0.1\/24,
  num_client: 1,
  monmap: ,
  mon_host: ,
  lockdep: false,
  run_dir: \/var\/run\/ceph,
  admin_socket: \/var\/run\/ceph\/ceph-osd.0.asok,
  daemonize: true,
  pid_file: \/var\/run\/ceph\/osd.0.pid,
  chdir: \/,
  max_open_files: 0,
  fatal_signal_handlers: true,
  log_file: \/var\/log\/ceph\/ceph-osd.0.log,
  log_max_new: 1000,
  log_max_recent: 1,
  log_to_stderr: false,
  err_to_stderr: true,
  log_to_syslog: false,
  err_to_syslog: false,
  log_flush_on_exit: true,
  log_stop_at_utilization: 0.97,
  clog_to_monitors: true,
  clog_to_syslog: false,
  clog_to_syslog_level: info,
  clog_to_syslog_facility: daemon,
  mon_cluster_log_to_syslog: false,
  mon_cluster_log_to_syslog_level: info,
  mon_cluster_log_to_syslog_facility: daemon,
  mon_cluster_log_file: \/var\/log\/ceph\/ceph.log,
  key: ,
  keyfile: ,
  keyring: \/etc\/ceph\/osd.0.keyring,
  heartbeat_interval: 5,
  heartbeat_file: ,
  heartbeat_inject_failure: 0,
  perf: true,
  ms_tcp_nodelay: true,
  ms_tcp_rcvbuf: 0,
  ms_initial_backoff: 0.2,
  ms_max_backoff: 15,
  ms_nocrc: false,
  ms_die_on_bad_msg: false,
  ms_die_on_unhandled_msg: false,
  ms_dispatch_throttle_bytes: 104857600,
  ms_bind_ipv6: false,
  ms_bind_port_min: 6800,
  ms_bind_port_max: 7100,
  ms_rwthread_stack_bytes: 1048576,
  ms_tcp_read_timeout: 900,
  ms_pq_max_tokens_per_priority: 4194304,
  ms_pq_min_cost: 65536,
  ms_inject_socket_failures: 0,
  ms_inject_delay_type: ,
  ms_inject_delay_max: 1,
  ms_inject_delay_probability: 0,
  ms_inject_internal_delays: 0,
  mon_data: \/var\/lib\/ceph\/mon\/ceph-0,
  mon_initial_members: ,
  mon_sync_fs_threshold: 5,
  mon_compact_on_start: false,
  mon_compact_on_bootstrap: false,
  mon_compact_on_trim: true,
  mon_tick_interval: 5,
  mon_subscribe_interval: 300,
  mon_osd_laggy_halflife: 3600,
  mon_osd_laggy_weight: 0.3,
  mon_osd_adjust_heartbeat_grace: true,
  mon_osd_adjust_down_out_interval: true,
  mon_osd_auto_mark_in: false,
  mon_osd_auto_mark_auto_out_in: true,
  mon_osd_auto_mark_new_in: true,
  mon_osd_down_out_interval: 300,
  mon_osd_down_out_subtree_limit: rack,
  mon_osd_min_up_ratio: 0.3,
  mon_osd_min_in_ratio: 0.3,
  mon_stat_smooth_intervals: 2,
  mon_lease: 5,
  mon_lease_renew_interval: 3,
  mon_lease_ack_timeout: 10,
  mon_clock_drift_allowed: 0.05,
  mon_clock_drift_warn_backoff: 5,
  mon_timecheck_interval: 300,
  mon_accept_timeout: 10,
  mon_pg_create_interval: 30,
  mon_pg_stuck_threshold: 300,
  mon_osd_full_ratio: 0.95,
  mon_osd_nearfull_ratio: 0.85,
  mon_globalid_prealloc: 100,
  mon_osd_report_timeout: 900,
  mon_force_standby_active: true,
  mon_min_osdmap_epochs: 500,
  mon_max_pgmap_epochs: 500,
  mon_max_log_epochs: 500,
  mon_max_osd: 1,
  mon_probe_timeout: 2,
  mon_slurp_timeout: 10,
  mon_slurp_bytes: 262144,
  mon_client_bytes: 104857600,
  mon_daemon_bytes: 419430400,
  mon_max_log_entries_per_event: 4096,
  mon_health_data_update_interval: 60,
  mon_data_avail_crit: 5,
  mon_data_avail_warn: 30,
  mon_config_key_max_entry_size: 4096,
  mon_sync_trim_timeout: 30,
  mon_sync_heartbeat_timeout: 30,
  mon_sync_heartbeat_interval: 5,
  mon_sync_backoff_timeout: 30,
  mon_sync_timeout: 30,
  mon_sync_max_retries: 5,
  mon_sync_max_payload_size: 1048576,
  mon_sync_debug: false,
  mon_sync_debug_leader: -1,
  mon_sync_debug_provider: -1,
  mon_sync_debug_provider_fallback: -1,
  mon_debug_dump_transactions: false,
  mon_debug_dump_location: \/var\/log\/ceph\/ceph-osd.0.tdump,
  mon_sync_leader_kill_at: 0,
  mon_sync_provider_kill_at: 0,
  mon_sync_requester_kill_at: 0,
  mon_leveldb_write_buffer_size: 33554432,
  mon_leveldb_cache_size: 268435456,
  mon_leveldb_block_size: 65536,
  mon_leveldb_bloom_size: 0,
  mon_leveldb_max_open_files: 0,
  mon_leveldb_compression: false,
  mon_leveldb_paranoid: false,
  mon_leveldb_log: ,
  paxos_stash_full_interval: 25,
  paxos_max_join_drift: 100,
  paxos_propose_interval: 1,
  paxos_min_wait: 0.05,
  paxos_min: 500,
 

Re: Rados Protocoll

2013-08-02 Thread Niklas Goerke

Hi Noah

Thank you for your comments.
My company's policy states that all software needs to go through a 
security assessment before it is allowed to be used in production. As all 
our tools are focused on Java, a native implementation would be far easier 
to handle than a Java binding of librados (which uses JNA, which is also 
quite large).
Also, with a native Java implementation there is no technology break across 
three different software packages from three maintainers; instead everything 
rests on one native Java implementation which only relies on the 
RADOS protocol.


As for the documentation you referenced: I didn't find any documentation 
of the RADOS protocol that an implementation of librados could be based 
upon. Does anything like this exist, or would I need to translate the C 
implementation?


Niklas


On 01.08.2013 23:53, Noah Watkins wrote:

Hi Niklas,

The RADOS reference implementation in C++ is quite large. Reproducing
it all in another language would be interesting, but I'm curious whether
wrapping the C interface is an option for you? There are Java
bindings being worked on here:
https://github.com/wido/rados-java.
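
For reference, a rough sketch of the kind of librados C calls such a binding
wraps (the pool and object names are only examples, and error checking of the
setup calls is omitted):

  #include <stdio.h>
  #include <rados/librados.h>

  int main(void)
  {
          rados_t cluster;
          rados_ioctx_t io;
          char buf[64];
          int r;

          rados_create(&cluster, NULL);              /* NULL = default client id */
          rados_conf_read_file(cluster, "/etc/ceph/ceph.conf");
          rados_connect(cluster);
          rados_ioctx_create(cluster, "data", &io);  /* open a pool by name */

          rados_write(io, "greeting", "hello rados", 11, 0);
          r = rados_read(io, "greeting", buf, sizeof(buf), 0);
          if (r >= 0)
                  printf("read %d bytes back\n", r);

          rados_ioctx_destroy(io);
          rados_shutdown(cluster);
          return 0;
  }

A native Java client would have to provide the same operations by speaking the
Ceph wire protocol (and computing CRUSH placements) itself, which is the part
that is not separately documented.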

There are links on ceph.com/docs to some information about Ceph, as
well as videos on YouTube and links to academic papers.

-Noah

On Thu, Aug 1, 2013 at 1:01 PM, Niklas Goerke nik...@niklasgoerke.de wrote:

Hi,

I was wondering why there is no native Java implementation of librados. I'm
thinking about creating one, and I'm thus looking for documentation of the
RADOS protocol.
Also, the way I see it, librados implements the CRUSH algorithm. Is there
documentation for it?
An educated guess about whether the RADOS protocol is subject to change
would also be very much appreciated.

Thank you in advance

Niklas


Re: [PATCH V5 2/8] fs/ceph: vfs __set_page_dirty_nobuffers interface instead of doing it inside filesystem

2013-08-02 Thread Sha Zhengju
On Thu, Aug 1, 2013 at 11:19 PM, Yan, Zheng uker...@gmail.com wrote:
 On Thu, Aug 1, 2013 at 7:51 PM, Sha Zhengju handai@gmail.com wrote:
 From: Sha Zhengju handai@taobao.com

 Following we will begin to add memcg dirty page accounting around
 __set_page_dirty_
 {buffers,nobuffers} in vfs layer, so we'd better use vfs interface to
 avoid exporting
 those details to filesystems.

 Signed-off-by: Sha Zhengju handai@taobao.com
 ---
  fs/ceph/addr.c |   13 +
  1 file changed, 1 insertion(+), 12 deletions(-)

 diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
 index 3e68ac1..1445bf1 100644
 --- a/fs/ceph/addr.c
 +++ b/fs/ceph/addr.c
 @@ -76,7 +76,7 @@ static int ceph_set_page_dirty(struct page *page)
 if (unlikely(!mapping))
 return !TestSetPageDirty(page);

 -   if (TestSetPageDirty(page)) {
 +   if (!__set_page_dirty_nobuffers(page)) {

 it's too early to set the radix tree tag here. We should set the page's snapshot
 context and increase i_wrbuffer_ref first. This is because once the tag
 is set, the writeback thread can find the page and start flushing it.

OK, thanks for pointing it out.



                 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
                      mapping->host, page, page->index);
                 return 0;
 @@ -107,14 +107,7 @@ static int ceph_set_page_dirty(struct page *page)
              snapc, snapc->seq, snapc->num_snaps);
         spin_unlock(&ci->i_ceph_lock);
 
 -       /* now adjust page */
 -       spin_lock_irq(&mapping->tree_lock);
         if (page->mapping) {    /* Race with truncate? */
 -               WARN_ON_ONCE(!PageUptodate(page));
 -               account_page_dirtied(page, page->mapping);
 -               radix_tree_tag_set(&mapping->page_tree,
 -                               page_index(page), PAGECACHE_TAG_DIRTY);
 -

 this code was copied from __set_page_dirty_nobuffers(). I think the reason
 Sage did this is to handle the race described in
 __set_page_dirty_nobuffers()'s comment. But I wonder if page->mapping ==
 NULL can still happen here, because truncate_inode_page() unmaps the page from
 processes' address spaces first, then deletes the page from the page cache.

But in the non-mmap case, doesn't it have no relation to 'unmap page from
address spaces'?
The check is exactly to avoid a race with delete_from_page_cache(),
since both need to hold mapping->tree_lock, and if truncate
goes first then __set_page_dirty_nobuffers() may see a NULL mapping.


Thanks,
Sha


[PATCH] ceph: Add check returned value on func ceph_calc_ceph_pg.

2013-08-02 Thread majianpeng
Func ceph_calc_ceph_pg may fail, so add a check for the returned value.

Signed-off-by: Jianpeng Ma majianp...@gmail.com
---
 fs/ceph/ioctl.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e0b4ef3..8c463dd 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -209,8 +209,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
 
-	ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
-			ceph_file_layout_pg_pool(ci->i_layout));
+	r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
+			ceph_file_layout_pg_pool(ci->i_layout));
+	if (r < 0) {
+		up_read(&osdc->map_sem);
+		return r;
+	}
 
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
-- 
1.8.1.2


Thanks!
Jianpeng Ma

Re: [PATCH V5 2/8] fs/ceph: vfs __set_page_dirty_nobuffers interface instead of doing it inside filesystem

2013-08-02 Thread Sha Zhengju
On Fri, Aug 2, 2013 at 2:27 AM, Sage Weil s...@inktank.com wrote:
 On Thu, 1 Aug 2013, Yan, Zheng wrote:
 On Thu, Aug 1, 2013 at 7:51 PM, Sha Zhengju handai@gmail.com wrote:
  From: Sha Zhengju handai@taobao.com
 
  Following we will begin to add memcg dirty page accounting around
 __set_page_dirty_
  {buffers,nobuffers} in vfs layer, so we'd better use vfs interface to
 avoid exporting
  those details to filesystems.
 
  Signed-off-by: Sha Zhengju handai@taobao.com
  ---
   fs/ceph/addr.c |   13 +
   1 file changed, 1 insertion(+), 12 deletions(-)
 
  diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
  index 3e68ac1..1445bf1 100644
  --- a/fs/ceph/addr.c
  +++ b/fs/ceph/addr.c
  @@ -76,7 +76,7 @@ static int ceph_set_page_dirty(struct page *page)
  if (unlikely(!mapping))
  return !TestSetPageDirty(page);
 
  -   if (TestSetPageDirty(page)) {
  +   if (!__set_page_dirty_nobuffers(page)) {
 it's too early to set the radix tree tag here. We should set page's snapshot
 context and increase the i_wrbuffer_ref first. This is because once the tag
 is set, writeback thread can find and start flushing the page.

 Unfortunately I only remember being frustrated by this code.  :)  Looking
 at it now, though, it seems like the minimum fix is to set the
 page->private before marking the page dirty.  I don't know the locking
 rules around that, though.  If that is potentially racy, maybe the safest
 thing would be if __set_page_dirty_nobuffers() took a void* to set
 page->private to atomically while holding the tree_lock.


Sorry, I don't quite get the point of your last sentence... Could you
please explain it again?

I notice there is a check in __set_page_dirty_nobuffers():
  WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
So does it mean we can only set page->private after it? But if so,
__mark_inode_dirty is still ahead of setting the snapc.


Thanks,
Sha


ocf script for ceph quorum check and fs

2013-08-02 Thread James Harper
I want to mount ceph fs (using fuse) but /etc/fstab treats it as a local 
filesystem and so tries to mount it before ceph is started, or indeed before 
the network is even up.

Also, ceph tries to start before the network is up and fails because it can't 
bind to an address. I think this is because I'm using openvswitch, which 
starts later.

Anyway, controlling it all via pacemaker seems like a much smarter thing to do 
as I can ensure that ceph doesn't start until the network is up (which it will 
be when pacemaker has quorum), and then the fs doesn't mount until ceph is 
started and the ceph cluster has quorum and maybe until an mds is running.

So to do this I need ocf scripts for:

. starting components of ceph, which appears to be provided by 
ceph-resource-agents.deb, but I haven't tested it yet
. checking ceph quorum - I guess I'd need to write a script for this, like the 
existing ping check
. checking that an mds is available - maybe part of the mount script
. mounting using ceph-fuse. Existing fs scripts make assumptions about fs types 
and parameters and don't work

Has anyone written a 'check_ceph_quorum' script? Or a fs mount script that is 
compatible with fuse? I don't want to re-invent any wheels!

Thanks

James



rados_clone_range for different pgs

2013-08-02 Thread Oleg Krasnianskiy
Hi

I have asked this question in ceph-users, but did not get any
response, so I'll test my luck again, but with ceph-devel =)

Is there any way to copy part of one object into another one if they
reside in different pgs?
There is rados_clone_range, but it requires both objects to be inside one pg.

Thanks!


Re: [PATCH V5 2/8] fs/ceph: vfs __set_page_dirty_nobuffers interface instead of doing it inside filesystem

2013-08-02 Thread Yan, Zheng
On Fri, Aug 2, 2013 at 5:04 PM, Sha Zhengju handai@gmail.com wrote:

 On Thu, Aug 1, 2013 at 11:19 PM, Yan, Zheng uker...@gmail.com wrote:
  On Thu, Aug 1, 2013 at 7:51 PM, Sha Zhengju handai@gmail.com wrote:
  From: Sha Zhengju handai@taobao.com
 
  Following we will begin to add memcg dirty page accounting around
  __set_page_dirty_
  {buffers,nobuffers} in vfs layer, so we'd better use vfs interface to
  avoid exporting
  those details to filesystems.
 
  Signed-off-by: Sha Zhengju handai@taobao.com
  ---
   fs/ceph/addr.c |   13 +
   1 file changed, 1 insertion(+), 12 deletions(-)
 
  diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
  index 3e68ac1..1445bf1 100644
  --- a/fs/ceph/addr.c
  +++ b/fs/ceph/addr.c
  @@ -76,7 +76,7 @@ static int ceph_set_page_dirty(struct page *page)
  if (unlikely(!mapping))
  return !TestSetPageDirty(page);
 
  -   if (TestSetPageDirty(page)) {
  +   if (!__set_page_dirty_nobuffers(page)) {
 
  it's too early to set the radix tree tag here. We should set page's snapshot
  context and increase the i_wrbuffer_ref first. This is because once the tag
  is set, writeback thread can find and start flushing the page.

 OK, thanks for pointing it out.

 
 
  dout(%p set_page_dirty %p idx %lu -- already dirty\n,
   mapping-host, page, page-index);
  return 0;
  @@ -107,14 +107,7 @@ static int ceph_set_page_dirty(struct page *page)
   snapc, snapc-seq, snapc-num_snaps);
  spin_unlock(ci-i_ceph_lock);
 
  -   /* now adjust page */
  -   spin_lock_irq(mapping-tree_lock);
  if (page-mapping) {/* Race with truncate? */
  -   WARN_ON_ONCE(!PageUptodate(page));
  -   account_page_dirtied(page, page-mapping);
  -   radix_tree_tag_set(mapping-page_tree,
  -   page_index(page), PAGECACHE_TAG_DIRTY);
  -
 
  this code was coped from __set_page_dirty_nobuffers(). I think the reason
  Sage did this is to handle the race described in
  __set_page_dirty_nobuffers()'s comment. But I'm wonder if page-mapping ==
  NULL can still happen here. Because truncate_inode_page() unmap page from
  processes's address spaces first, then delete page from page cache.

 But in non-mmap case, doesn't it has no relation to 'unmap page from
 address spaces'?

In the non-mmap case, the page is locked when the set_page_dirty() callback is
called. truncate_inode_page() waits until the page is unlocked, then deletes it
from the page cache.

Regards
Yan, Zheng

 The check is exactly avoiding racy with delete_from_page_cache(),
 since the two both need to hold mapping-tree_lock, and if truncate
 goes first then __set_page_dirty_nobuffers() may have NULL mapping.


 Thanks,
 Sha


Re: rados_clone_range for different pgs

2013-08-02 Thread Sage Weil
Hi Oleg,

On Fri, 2 Aug 2013, Oleg Krasnianskiy wrote:
 Hi
 
 I have asked this question in ceph-users, but did not get any
 response, so I'll test my luck again, but with ceph-devel =)

Sorry about that!
 
 Is there any way to copy part of one object into another one if they
 reside in different pgs?
 There is rados_clone_range, but it requires both objects to be inside one pg.

There is no way currently.  The clone_range can only (reliably) work on an 
OSD if both objects are stored with the same locator key; otherwise you have 
a ~R/N chance of that happening (where N is the number of OSDs, R is the number 
of replicas), which isn't worth optimizing for.  If the objects aren't 
stored together, you need to read and then write the data; this avoids 
adding additional complexity to the OSD for minimal gain.
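
For what it's worth, a minimal sketch of that read-then-write fallback with the
librados C API (the helper name is hypothetical; it assumes an already-connected
rados_ioctx_t, and a real version would want to loop over larger ranges):

  #include <errno.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <rados/librados.h>

  /* Copy a byte range from src_oid to dst_oid by reading it into a client
   * buffer and rewriting it, so the two objects may live in different PGs. */
  static int copy_range(rados_ioctx_t io, const char *src_oid,
                        const char *dst_oid, uint64_t off, size_t len)
  {
          char *buf = malloc(len);
          int r;

          if (!buf)
                  return -ENOMEM;
          r = rados_read(io, src_oid, buf, len, off);     /* bytes read, or <0 */
          if (r > 0)
                  r = rados_write(io, dst_oid, buf, r, off);
          free(buf);
          return r;
  }

Unlike rados_clone_range(), this works no matter where the two objects are
placed, at the cost of moving the data through the client.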

Do you have a use-case in mind where this functionality is important?

sage



Re: PG Backend Proposal

2013-08-02 Thread Loic Dachary
Hi Sam,

 - coll_t needs to include a chunk_id_t.
https://github.com/athanatos/ceph/blob/2234bdf7fc30738363160d598ae8b4d6f75e1dd1/doc/dev/osd_internals/erasure_coding.rst#distinguished-acting-set-positions

That would be for a sanity check? Since the rank of the chunk (chunk_id_t) 
matches the position in the acting set and a history of osdmaps is kept, would 
this be used when loading the PG from disk to make sure it matches the expected 
chunk_id_t?

Cheers

On 02/08/2013 09:39, Loic Dachary wrote:
 Hi Sam,
 
 I think I understand, and I am paraphrasing you to make sure I do. We may save 
 bandwidth because chunks are not moved as much if their position is not tied 
 to the position of the OSD containing them in the acting set. But this is 
 mitigated by the use of the indep crush mode, and it may require handling 
 tricky edge cases. In addition, you think that being able to know which OSD 
 contains which chunk using only the OSDMap and the (v)hobject_t is going 
 to simplify the design.
 
 For the record:
 
 Back in April Sage suggested that
 
 - those PGs use the parity ('INDEP') crush mode so that placement is 
 intelligent
 
 http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/14579
 
 The indep placement avoids moving around a shard between ranks, because a 
 mapping of [0,1,2,3,4] will change to [0,6,2,3,4] (or something) if osd.1 
 fails and the shards on 2,3,4 won't need to be copied around.
 
 http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/14582
 
 and I assume that's what you refer to when you write CRUSH has a mode which 
 will cause replacement to behave well for erasure codes:
 
  initial: [0,1,2]
  0 fails: [3,1,2]
  2 fails: [3,1,4]
  0 recovers: [0,1,4]
 
 I understand this is implemented here:
 
 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/crush/mapper.c#L523
 
 and will determine the order of the acting set 
 
 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/osd/OSDMap.cc#L998
 
 when called by the monitor when creating or updating a PG
 
 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/mon/PGMonitor.cc#L814
 
 Cheers
 
 On 02/08/2013 03:34, Samuel Just wrote:
 I think there are some tricky edge cases with the above approach.  You
 might end up with two pg replicas in the same acting set which happen
 for reasons of history to have the same chunk for one or more objects.
  That would have to be detected and repaired even though the object
 would be missing from neither replica (and might not even be in the pg
 log).  The erasure_code_rank would have to be somehow maintained
 through recovery (do we remember the original holder of a particular
 chunk in case it ever comes back?).

 The chunk rank doesn't *need* to match the acting set position, but
 there are some good reasons to arrange for that to be the case:
 1) Otherwise, we need something else to assign the chunk ranks
 2) This way, a new primary can determine which osds hold which
 replicas of which chunk rank by looking at past osd maps.

 It seems to me that given an OSDMap and an object, we should know
 immediately where all chunks should be stored since a future primary
 may need to do that without access to the objects themselves.

 Importantly, while it may be possible for an acting set transition
 like [0,1,2]-[2,1,0] to occur in some pathological case, CRUSH has a
 mode which will cause replacement to behave well for erasure codes:

 initial: [0,1,2]
 0 fails: [3,1,2]
 2 fails: [3,1,4]
 0 recovers: [0,1,4]

 We do, however, need to decouple primariness from position in the
 acting set so that backfill can work well.
 -Sam

 On Thu, Aug 1, 2013 at 4:54 PM, Loic Dachary l...@dachary.org wrote:
 Hi Sam,

 I'm under the impression that
 https://github.com/athanatos/ceph/blob/wip-erasure-coding-doc/doc/dev/osd_internals/erasure_coding.rst#distinguished-acting-set-positions
 assumes acting[0] stores all chunk[0], acting[1] stores all chunk[1] etc.

 The chunk rank does not need to match the OSD position in the acting set. 
 As long as each object chunk is stored with its rank in an attribute, 
 changing the order of the acting set does not require to move the chunks 
 around.

 With M=2+K=1 and the acting set is [0,1,2] chunks M0,M1,K0 are written on 
 [0,1,2] respectively, each of them have the 'erasure_code_rank' attribute 
 set to their rank.

 If the acting set changes to [2,1,0] the read would reorder the chunk based 
 on their 'erasure_code_rank' attribute instead of the rank of the OSD they 
 originate from in the current acting set. And then be able to decode them 
 with the erasure code library, which requires that the chunks are provided 
 in a specific order.

 When doing a full write, the chunks are written in the same order as the 
 acting set. This implies that the order of the chunks of the previous 
 version of the object may be different but I don't see a problem with that.

 When doing an 

mds sessions at cds

2013-08-02 Thread Sage Weil
There are two short sessions for MDS blueprints:

  mds: dumpability
  mds: reduce memory usage

These are both reasonably self-contained projects that are easy for people 
to get involved in but have relatively high payoff in terms of MDS 
performance and debuggability.  If anyone is interested in getting 
involved in CephFS/MDS development, please add your name to the interested 
parties and join the CDS session hangout on Monday afternoon!

sage


Re: PG Backend Proposal

2013-08-02 Thread Samuel Just
The reason for the chunk_id_t in the coll_t is to handle a tricky edge case:
[0,1,2]
[3,1,2]
..time passes..
[3,0,2]

This should be exceedingly rare, but a single osd might end up with
copies of two different chunks of the same pg.

When an osd joins an acting set with a preexisting copy of the pg and
can be brought up to date with logs, we must know which chunk each
object in the replica is without having to scan the PG (more to the
point, we need to know that each chunk matches the chunk of the osd
which it replaced).  If a pg replica can store any chunk, we need a
mechanism to ensure that.  It seems simpler to force all objects
within a replica to be the same chunk and furthermore to tie that
chunk to the position in the acting set.
-Sam

On Fri, Aug 2, 2013 at 8:10 AM, Loic Dachary l...@dachary.org wrote:
 Hi Sam,

 - coll_t needs to include a chunk_id_t.
 https://github.com/athanatos/ceph/blob/2234bdf7fc30738363160d598ae8b4d6f75e1dd1/doc/dev/osd_internals/erasure_coding.rst#distinguished-acting-set-positions

 That would be for sanity check ? Since the rank of the chunk ( chunk_id_t ) 
 matches the position in the acting set and a history of osdmaps is kept, 
 would this be used when loading the pg from disk to make sure it matches the 
 expected chunk_id_t ?

 Cheers

 On 02/08/2013 09:39, Loic Dachary wrote:
 Hi Sam,

 I think I understand and paraphrasing you to make sure I do. We may save 
 bandwidth because chunks are not moved as much if their position is not tied 
 to the position of the OSD containing them in the acting set. But this is 
 mitigated by the use of the indep crush mode. And it may require to handle 
 tricky edge cases. In addition, you think that being able to know which OSD 
 contains which chunk by using only the OSDMap and the (v)hobject_t is going 
 to simplify the design.

 For the record:

 Back in April Sage suggested that

 - those PGs use the parity ('INDEP') crush mode so that placement is 
 intelligent

 http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/14579

 The indep placement avoids moving around a shard between ranks, because a 
 mapping of [0,1,2,3,4] will change to [0,6,2,3,4] (or something) if osd.1 
 fails and the shards on 2,3,4 won't need to be copied around.

 http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/14582

 and I assume that's what you refer to when you write CRUSH has a mode which 
 will cause replacement to behave well for erasure codes:

  initial: [0,1,2]
  0 fails: [3,1,2]
  2 fails: [3,1,4]
  0 recovers: [0,1,4]

 I understand this is implemented here:

 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/crush/mapper.c#L523

 and will determine the order of the acting set

 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/osd/OSDMap.cc#L998

 when called by the monitor when creating or updating a PG

 https://github.com/ceph/ceph/blob/e7d47827f0333c96ad43d257607fb92ed4176550/src/mon/PGMonitor.cc#L814

 Cheers

 On 02/08/2013 03:34, Samuel Just wrote:
 I think there are some tricky edge cases with the above approach.  You
 might end up with two pg replicas in the same acting set which happen
 for reasons of history to have the same chunk for one or more objects.
  That would have to be detected and repaired even though the object
 would be missing from neither replica (and might not even be in the pg
 log).  The erasure_code_rank would have to be somehow maintained
 through recovery (do we remember the original holder of a particular
 chunk in case it ever comes back?).

 The chunk rank doesn't *need* to match the acting set position, but
 there are some good reasons to arrange for that to be the case:
 1) Otherwise, we need something else to assign the chunk ranks
 2) This way, a new primary can determine which osds hold which
 replicas of which chunk rank by looking at past osd maps.

 It seems to me that given an OSDMap and an object, we should know
 immediately where all chunks should be stored since a future primary
 may need to do that without access to the objects themselves.

 Importantly, while it may be possible for an acting set transition
 like [0,1,2]-[2,1,0] to occur in some pathological case, CRUSH has a
 mode which will cause replacement to behave well for erasure codes:

 initial: [0,1,2]
 0 fails: [3,1,2]
 2 fails: [3,1,4]
 0 recovers: [0,1,4]

 We do, however, need to decouple primariness from position in the
 acting set so that backfill can work well.
 -Sam

 On Thu, Aug 1, 2013 at 4:54 PM, Loic Dachary l...@dachary.org wrote:
 Hi Sam,

 I'm under the impression that
 https://github.com/athanatos/ceph/blob/wip-erasure-coding-doc/doc/dev/osd_internals/erasure_coding.rst#distinguished-acting-set-positions
 assumes acting[0] stores all chunk[0], acting[1] stores all chunk[1] etc.

 The chunk rank does not need to match the OSD position in the acting set. 
 As long as each object chunk is stored with its rank in an attribute, 
 changing the 

Re: still recovery issues with cuttlefish

2013-08-02 Thread Samuel Just
You might try turning osd_max_backfills to 2 or 1.
-Sam

On Fri, Aug 2, 2013 at 12:44 AM, Stefan Priebe s.pri...@profihost.ag wrote:
 On 01.08.2013 23:23, Samuel Just wrote: Can you dump your osd settings?

 sudo ceph --admin-daemon ceph-osd.osdid.asok config show

 Sure.



  [... osd config dump snipped; identical to the one quoted earlier in this thread ...]

Re: still recovery issues with cuttlefish

2013-08-02 Thread Andrey Korolyov
Created #5844.

On Thu, Aug 1, 2013 at 10:38 PM, Samuel Just sam.j...@inktank.com wrote:
 Is there a bug open for this?  I suspect we don't sufficiently
 throttle the snapshot removal work.
 -Sam

 On Thu, Aug 1, 2013 at 7:50 AM, Andrey Korolyov and...@xdel.ru wrote:
 Second this. Also, for the long-lasting snapshot problem and related
 performance issues, I can say that cuttlefish improved things greatly,
 but creation/deletion of a large snapshot (hundreds of gigabytes of
 committed data) can still bring the cluster down for minutes, despite
 use of every possible optimization.

 On Thu, Aug 1, 2013 at 12:22 PM, Stefan Priebe - Profihost AG
 s.pri...@profihost.ag wrote:
 Hi,

 i still have recovery issues with cuttlefish. After the OSD comes back
 it seems to hang for around 2-4 minutes and then recovery seems to start
 (pgs in recovery_wait start to decrement). This is with ceph 0.61.7. I
 get a lot of slow request messages and hanging VMs.

 What i noticed today is that if i leave the OSD off until ceph
 starts to backfill, the recovery and re-backfilling went absolutely
 smoothly, without any issues and no slow request messages at all.

 Does anybody have an idea why?

 Greets,
 Stefan


ceph-deploy progress and CDS session

2013-08-02 Thread Sage Weil
There is a session at CDS scheduled to discuss ceph-deploy (4:40pm PDT on 
Monday).  We'll be going over what we currently have in backlog for 
improvements, but if you have any opinions about what else ceph-deploy 
should or should not do or areas where it is problematic, please reply to 
this thread to let us know what you think, and/or join the CDS discussion 
hangout.

For those who haven't noticed, we now have a full-time developer,
Alfredo Deza, who is working on ceph-deploy.  He's been making huge
progress over the last couple of weeks improving error reporting,
visibility into what ceph-deploy is doing, and fixing various bugs.  We
have a long list of things we want to do with the tool, but any feedback
from users is helpful to make sure we're working on the right things 
first!

sage




Re: still recovery issues with cuttlefish

2013-08-02 Thread Stefan Priebe
I already tried both values; it makes no difference. The drives are not 
the bottleneck.


On 02.08.2013 19:35, Samuel Just wrote:

You might try turning osd_max_backfills to 2 or 1.
-Sam

On Fri, Aug 2, 2013 at 12:44 AM, Stefan Priebe s.pri...@profihost.ag wrote:

On 01.08.2013 23:23, Samuel Just wrote: Can you dump your osd settings?


sudo ceph --admin-daemon ceph-osd.osdid.asok config show


Sure.



  [... osd config dump snipped; identical to the one quoted earlier in this thread ...]

Re: still recovery issues with cuttlefish

2013-08-02 Thread Samuel Just
Also, you have osd_recovery_op_priority at 50.  That is close to the
priority of client IO.  You want it below 10 (defaults to 10), perhaps
at 1.  You can also adjust down osd_recovery_max_active.
-Sam

On Fri, Aug 2, 2013 at 11:16 AM, Stefan Priebe s.pri...@profihost.ag wrote:
 I already tried both values; it makes no difference. The drives are not the
 bottleneck.

 On 02.08.2013 19:35, Samuel Just wrote:

 You might try turning osd_max_backfills to 2 or 1.
 -Sam

 On Fri, Aug 2, 2013 at 12:44 AM, Stefan Priebe s.pri...@profihost.ag
 wrote:

 On 01.08.2013 23:23, Samuel Just wrote: Can you dump your osd
 settings?

 sudo ceph --admin-daemon ceph-osd.osdid.asok config show


 Sure.



  [... osd config dump snipped; identical to the one quoted earlier in this thread ...]

Re: still recovery issues with cuttlefish

2013-08-02 Thread Stefan Priebe

Hi,

osd recovery max active = 1
osd max backfills = 1
osd recovery op priority = 5

still no difference...

Stefan

On 02.08.2013 20:21, Samuel Just wrote:

Also, you have osd_recovery_op_priority at 50.  That is close to the
priority of client IO.  You want it below 10 (defaults to 10), perhaps
at 1.  You can also adjust down osd_recovery_max_active.
-Sam

On Fri, Aug 2, 2013 at 11:16 AM, Stefan Priebe s.pri...@profihost.ag wrote:

I already tried both values; it makes no difference. The drives are not the
bottleneck.

On 02.08.2013 19:35, Samuel Just wrote:


You might try turning osd_max_backfills to 2 or 1.
-Sam

On Fri, Aug 2, 2013 at 12:44 AM, Stefan Priebe s.pri...@profihost.ag
wrote:


On 01.08.2013 23:23, Samuel Just wrote: Can you dump your osd
settings?


sudo ceph --admin-daemon ceph-osd.osdid.asok config show



Sure.



  [... osd config dump snipped; identical to the one quoted earlier in this thread ...]

Re: [ceph-users] ceph-deploy progress and CDS session

2013-08-02 Thread Eric Eastman

Hi,

First I would like to state that, with all its limitations, I have 
managed to build multiple clusters with ceph-deploy, and without it I would 
have been totally lost.  Things that I feel would improve it include:

A debug mode where it lists everything it is doing.  This will be 
helpful in the future when I move to a more integrated tool than ceph-deploy, 
as I could see exactly how ceph-deploy built my test cluster.

Understanding more types of Linux storage devices.  I have spent hours 
trying to make it understand multipath devices, as I happen to have a large 
number of these in my lab, but so far I have not made it work.

Really good documentation on all the ceph-deploy options.

Lastly, this is not just a ceph-deploy thing, but documentation 
explaining how things boot up and interact.  Ceph-deploy depends on tools 
like ceph-disk to mount OSD disks on the servers during boot, and I learned 
the hard way that if an OSD is on a LUN that is seen by more than one OSD 
node, you can corrupt data, as each OSD node tries to mount all the OSDs it 
can find.

There is a session at CDS scheduled to discuss ceph-deploy (4:40pm PDT on 
Monday).  We'll be going over what we currently have in backlog for 
improvements, but if you have any opinions about what else ceph-deploy 
should or should not do or areas where it is problematic, please reply to 
this thread to let us know what you think, and/or join the CDS discussion 
hangout.



Thanks
Eric


Re: [PATCH] ceph: Add check returned value on func ceph_calc_ceph_pg.

2013-08-02 Thread Sage Weil
Applied, thanks!

On Fri, 2 Aug 2013, majianpeng wrote:

 Func ceph_calc_ceph_pg may fail, so add a check for the returned value.
 
 Signed-off-by: Jianpeng Ma majianp...@gmail.com
 ---
  fs/ceph/ioctl.c | 8 ++--
  1 file changed, 6 insertions(+), 2 deletions(-)
 
 diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
 index e0b4ef3..8c463dd 100644
 --- a/fs/ceph/ioctl.c
 +++ b/fs/ceph/ioctl.c
 @@ -209,8 +209,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
  	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
  		 ceph_ino(inode), dl.object_no);
  
 -	ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
 -			ceph_file_layout_pg_pool(ci->i_layout));
 +	r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
 +			ceph_file_layout_pg_pool(ci->i_layout));
 +	if (r < 0) {
 +		up_read(&osdc->map_sem);
 +		return r;
 +	}
  
  	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
  	if (dl.osd >= 0) {
 -- 
 1.8.1.2
 
 
 Thanks!
 Jianpeng Ma


Re: [PATCH] ceph: fix bugs about handling short-read for sync read mode.

2013-08-02 Thread Sage Weil
BTW, I was going to add this to the testing branch but it doesn't apply to 
the current tree.  Can you rebase on top of ceph-client.git #testing?

Thanks!
sage


On Fri, 2 Aug 2013, majianpeng wrote:

 cephfs . show_layout
 layout.data_pool: 0
 layout.object_size:   4194304
 layout.stripe_unit:   4194304
 layout.stripe_count:  1
 
 TestA:
 dd if=/dev/urandom of=test bs=1M count=2 oflag=direct
 dd if=/dev/urandom of=test bs=1M count=2 seek=4  oflag=direct
 dd if=test of=/dev/null bs=6M count=1 iflag=direct
 The messages from func striped_read are:
 ceph:   file.c:350  : striped_read 0~6291456 (read 0) got 2097152 
 HITSTRIPE SHORT
 ceph:   file.c:350  : striped_read 2097152~4194304 (read 2097152) got 
 0 HITSTRIPE SHORT
 ceph:   file.c:381  : zero tail 4194304
 ceph:   file.c:390  : striped_read returns 6291456
 The hole in the file is from 2M--4M. But actually it zeroes the last 4M, including
 the last 2M area, which isn't a hole.
 Using this patch, the messages are:
 ceph:   file.c:350  : striped_read 0~6291456 (read 0) got 2097152 
 HITSTRIPE SHORT
 ceph:   file.c:358  :  zero gap 2097152 to 4194304
 ceph:   file.c:350  : striped_read 4194304~2097152 (read 4194304) got 
 2097152
 ceph:   file.c:384  : striped_read returns 6291456
 
 TestB:
 echo majianpeng > test
 dd if=test of=/dev/null bs=2M count=1 iflag=direct
 The messages are:
 ceph:   file.c:350  : striped_read 0~6291456 (read 0) got 11 
 HITSTRIPE SHORT
 ceph:   file.c:350  : striped_read 11~6291445 (read 11) got 0 
 HITSTRIPE SHORT
 ceph:   file.c:390  : striped_read returns 11
 For this case, it did one more striped_read, which is meaningless.
 Using this patch, the messages are:
 ceph:   file.c:350  : striped_read 0~6291456 (read 0) got 11 
 HITSTRIPE SHORT
 ceph:   file.c:384  : striped_read returns 11
 
 Big thanks to Yan Zheng for the patch.
 
 Signed-off-by: Jianpeng Ma majianp...@gmail.com
 Reviewed-by: Yan, Zheng zheng.z@intel.com
 ---
  fs/ceph/file.c | 40 +---
  1 file changed, 17 insertions(+), 23 deletions(-)
 
 diff --git a/fs/ceph/file.c b/fs/ceph/file.c
 index 2ddf061..3d8d14d 100644
 --- a/fs/ceph/file.c
 +++ b/fs/ceph/file.c
 @@ -349,44 +349,38 @@ more:
 	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
 	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
 -	if (ret > 0) {
 -		int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
 -
 -		if (read < pos - off) {
 -			dout(" zero gap %llu to %llu\n", off + read, pos);
 -			ceph_zero_page_vector_range(page_align + read,
 -						    pos - off - read, pages);
 +	if (ret >= 0) {
 +		int didpages;
 +		if (was_short && (pos + ret < inode->i_size)) {
 +			u64 tmp = min(this_len - ret,
 +				      inode->i_size - pos - ret);
 +			dout(" zero gap %llu to %llu\n",
 +			     pos + ret, pos + ret + tmp);
 +			ceph_zero_page_vector_range(page_align + read + ret,
 +						    tmp, pages);
 +			ret += tmp;
 		}
 +
 +		didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
 		pos += ret;
 		read = pos - off;
 		left -= ret;
 		page_pos += didpages;
 		pages_left -= didpages;
 
 -		/* hit stripe? */
 -		if (left && hit_stripe)
 +		/* hit stripe and need continue */
 +		if (left && hit_stripe && pos < inode->i_size)
 			goto more;
 +
 	}
 
 -	if (was_short) {
 +	if (ret >= 0) {
 +		ret = read;
 		/* did we bounce off eof? */
 		if (pos + left > inode->i_size)
 			*checkeof = 1;
 -
 -		/* zero trailing bytes (inside i_size) */
 -		if (left > 0 && pos < inode->i_size) {
 -			if (pos + left > inode->i_size)
 -				left = inode->i_size - pos;
 -
 -			dout("zero tail %d\n", left);
 -			ceph_zero_page_vector_range(page_align + read, left,
 -						    pages);
 -			read += left;
 -		}
 	}
 
 -	if (ret >= 0)
 -		ret = read;
 	dout("striped_read returns %d\n", ret);
 	return ret;
  }
 -- 
 1.8.1.2
 
 


Re: [PATCH V5 2/8] fs/ceph: vfs __set_page_dirty_nobuffers interface instead of doing it inside filesystem

2013-08-02 Thread Sage Weil
On Fri, 2 Aug 2013, Sha Zhengju wrote:
 On Fri, Aug 2, 2013 at 2:27 AM, Sage Weil s...@inktank.com wrote:
  On Thu, 1 Aug 2013, Yan, Zheng wrote:
  On Thu, Aug 1, 2013 at 7:51 PM, Sha Zhengju handai@gmail.com wrote:
   From: Sha Zhengju handai@taobao.com
  
   Following we will begin to add memcg dirty page accounting around
  __set_page_dirty_
   {buffers,nobuffers} in vfs layer, so we'd better use vfs interface to
  avoid exporting
   those details to filesystems.
  
   Signed-off-by: Sha Zhengju handai@taobao.com
   ---
fs/ceph/addr.c |   13 +
1 file changed, 1 insertion(+), 12 deletions(-)
  
   diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
   index 3e68ac1..1445bf1 100644
   --- a/fs/ceph/addr.c
   +++ b/fs/ceph/addr.c
   @@ -76,7 +76,7 @@ static int ceph_set_page_dirty(struct page *page)
   if (unlikely(!mapping))
   return !TestSetPageDirty(page);
  
   -   if (TestSetPageDirty(page)) {
   +   if (!__set_page_dirty_nobuffers(page)) {
  it's too early to set the radix tree tag here. We should set page's 
  snapshot
  context and increase the i_wrbuffer_ref first. This is because once the tag
  is set, writeback thread can find and start flushing the page.
 
  Unfortunately I only remember being frustrated by this code.  :)  Looking
  at it now, though, it seems like the minimum fix is to set the
  page-private before marking the page dirty.  I don't know the locking
  rules around that, though.  If that is potentially racy, maybe the safest
  thing would be if __set_page_dirty_nobuffers() took a void* to set
  page-private to atomically while holding the tree_lock.
 
 
 Sorry, I don't catch the point of your last sentence... Could you
 please explain it again?

It didn't make much sense.  :)  I was worried about multiple callers to 
set_page_dirty, but as I understand it, this all happens under the page 
lock, right?  (There is a mention of other special cases in 
mm/page-writeback.c, but I'm hoping we don't need to worry about that.)

In any case, I suspect what we actually want is something like the below 
(untested) patch.  The snapc accounting can be ignored here because 
invalidatepage will clean it up...

sage



diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index afb2fc2..7602e46 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -76,9 +76,10 @@ static int ceph_set_page_dirty(struct page *page)
 	if (unlikely(!mapping))
 		return !TestSetPageDirty(page);
 
-	if (TestSetPageDirty(page)) {
+	if (PageDirty(page)) {
 		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
 		     mapping->host, page, page->index);
+		BUG_ON(!PagePrivate(page));
 		return 0;
 	}
 
@@ -107,35 +108,16 @@ static int ceph_set_page_dirty(struct page *page)
 	     snapc, snapc->seq, snapc->num_snaps);
 	spin_unlock(&ci->i_ceph_lock);
 
-	/* now adjust page */
-	spin_lock_irq(&mapping->tree_lock);
-	if (page->mapping) {	/* Race with truncate? */
-		WARN_ON_ONCE(!PageUptodate(page));
-		account_page_dirtied(page, page->mapping);
-		radix_tree_tag_set(&mapping->page_tree,
-				page_index(page), PAGECACHE_TAG_DIRTY);
-
-		/*
-		 * Reference snap context in page->private.  Also set
-		 * PagePrivate so that we get invalidatepage callback.
-		 */
-		page->private = (unsigned long)snapc;
-		SetPagePrivate(page);
-	} else {
-		dout("ANON set_page_dirty %p (raced truncate?)\n", page);
-		undo = 1;
-	}
-
-	spin_unlock_irq(&mapping->tree_lock);
-
-	if (undo)
-		/* whoops, we failed to dirty the page */
-		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	/*
+	 * Reference snap context in page->private.  Also set
+	 * PagePrivate so that we get invalidatepage callback.
+	 */
+	BUG_ON(PagePrivate(page));
+	page->private = (unsigned long)snapc;
+	SetPagePrivate(page);
 
-	BUG_ON(!PageDirty(page));
-	return 1;
+	return __set_page_dirty_nobuffers(page);
 }
 
 /*


bug in /etc/init.d/ceph debian

2013-08-02 Thread James Harper
I'm running ceph 0.61.7-1~bpo70+1 and I think there is a bug in /etc/init.d/ceph

The heartbeat RA expects that the init.d script will return 3 for 'not 
running', but if there is no agent (e.g. mds) defined for that host it will 
return 0 instead. So pacemaker thinks the agent is running on a node where it 
isn't even defined, and presumably would then start doing STONITH when it finds 
that it remains 'running' after a stop command.

Or maybe that is the correct behaviour of the init.d script and the RA needs to 
be modified?

Thanks

James