Re: [PATCH v7 10/16] dlm: use new hashtable implementation
On Mon, Oct 29, 2012 at 12:07:10PM -0400, Mathieu Desnoyers wrote: I'm fine with turning a direct + modulo mapping into a dispersed hash as long as there are no underlying assumptions about sequentiality of value accesses. If the access pattern would happen to be typically sequential, then adding dispersion could hurt performance significantly, turning a frequent L1 access into an L2 access for instance. All I'm asking is: have you made sure that this hash table is not deliberately kept sequential (without dispersion) to accelerate specific access patterns? This should at least be documented in the changelog. It was not intentional. I don't expect any benefit would be lost by making it non-sequential. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 00/12] dlm: updates
A variety of things including bug fixes, cleanups, and a couple of enhancements. Some of these updates are broad enough to cross the current dlm patch divisions in -mm. Dave -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 04/12] dlm: node weights
Use node weights in directory mapping. Allows nodes to be configured to be responsible for more or less of the directory. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/dir.c === --- linux-2.6.12-mm1.orig/drivers/dlm/dir.c +++ linux-2.6.12-mm1/drivers/dlm/dir.c @@ -89,13 +89,17 @@ int dlm_dir_name2nodeid(struct dlm_ls *l } hash = dlm_hash(name, length); - node = (hash 16) % ls-ls_num_nodes; if (ls-ls_node_array) { + node = (hash 16) % ls-ls_total_weight; nodeid = ls-ls_node_array[node]; goto out; } + /* make_member_array() failed to kmalloc ls_node_array... */ + + node = (hash 16) % ls-ls_num_nodes; + list_for_each(tmp, ls-ls_nodes) { if (n++ != node) continue; Index: linux-2.6.12-mm1/drivers/dlm/dlm_internal.h === --- linux-2.6.12-mm1.orig/drivers/dlm/dlm_internal.h +++ linux-2.6.12-mm1/drivers/dlm/dlm_internal.h @@ -134,6 +134,7 @@ struct dlm_member { struct list_headlist; int nodeid; int gone_event; + int weight; }; /* @@ -457,6 +458,7 @@ struct dlm_ls { struct list_headls_nodes_gone; /* dead node list, recovery */ int ls_num_nodes; /* number of nodes in ls */ int ls_low_nodeid; + int ls_total_weight; int *ls_node_array; int *ls_nodeids_next; int ls_nodeids_next_count; Index: linux-2.6.12-mm1/drivers/dlm/lowcomms.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lowcomms.c +++ linux-2.6.12-mm1/drivers/dlm/lowcomms.c @@ -254,6 +254,19 @@ static int nodeid_to_addr(int nodeid, st return 0; } +int dlm_node_weight(int nodeid) +{ + struct dlm_node *node; + int weight = -1; + + down(nodes_sem); + node = search_node(nodeid); + if (node) + weight = node-weight; + up(nodes_sem); + return weight; +} + int dlm_set_node(int nodeid, int weight, char *addr_buf) { struct dlm_node *node; Index: linux-2.6.12-mm1/drivers/dlm/lowcomms.h === --- linux-2.6.12-mm1.orig/drivers/dlm/lowcomms.h +++ linux-2.6.12-mm1/drivers/dlm/lowcomms.h @@ -23,6 +23,7 @@ void dlm_lowcomms_commit_buffer(void *mh int dlm_set_node(int nodeid, int weight, char *addr_buf); int 
dlm_set_local(int nodeid, int weight, char *addr_buf); int dlm_our_nodeid(void); +int dlm_node_weight(int nodeid); #endif /* __LOWCOMMS_DOT_H__ */ Index: linux-2.6.12-mm1/drivers/dlm/member.c === --- linux-2.6.12-mm1.orig/drivers/dlm/member.c +++ linux-2.6.12-mm1/drivers/dlm/member.c @@ -56,6 +56,7 @@ static int dlm_add_member(struct dlm_ls return -ENOMEM; memb-nodeid = nodeid; + memb-weight = dlm_node_weight(nodeid); add_ordered_member(ls, memb); ls-ls_num_nodes++; return 0; @@ -126,19 +127,43 @@ void dlm_clear_members_finish(struct dlm static void make_member_array(struct dlm_ls *ls) { struct dlm_member *memb; - int i = 0, *array; + int i, w, x = 0, total = 0, all_zero = 0, *array; - if (ls-ls_node_array) { - kfree(ls-ls_node_array); - ls-ls_node_array = NULL; + kfree(ls-ls_node_array); + ls-ls_node_array = NULL; + + list_for_each_entry(memb, ls-ls_nodes, list) { + if (memb-weight) + total += memb-weight; } - array = kmalloc(sizeof(int) * ls-ls_num_nodes, GFP_KERNEL); + /* all nodes revert to weight of 1 if all have weight 0 */ + + if (!total) { + total = ls-ls_num_nodes; + all_zero = 1; + } + + ls-ls_total_weight = total; + + array = kmalloc(sizeof(int) * total, GFP_KERNEL); if (!array) return; - list_for_each_entry(memb, ls-ls_nodes, list) - array[i++] = memb-nodeid; + list_for_each_entry(memb, ls-ls_nodes, list) { + if (!all_zero !memb-weight) + continue; + + if (all_zero) + w = 1; + else + w = memb-weight; + + DLM_ASSERT(x total, printk(total %d x %d\n, total, x);); + + for (i = 0; i w; i++) + array[x++] = memb-nodeid; + } ls-ls_node_array = array; } -- - To unsubscribe from this list: send the line unsubscribe linux
[patch 03/12] dlm: make code static
This patch makes needlessly global code static. Signed-off-by: Adrian Bunk [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/lock.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lock.c +++ linux-2.6.12-mm1/drivers/dlm/lock.c @@ -92,7 +92,7 @@ static int receive_extralen(struct dlm_m * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) */ -const int __dlm_compat_matrix[8][8] = { +static const int __dlm_compat_matrix[8][8] = { /* UN NL CR CW PR PW EX PD */ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ @@ -139,7 +139,7 @@ int dlm_modes_compat(int mode1, int mode * Usage: matrix[grmode+1][rqmode+1] */ -const int __quecvt_compat_matrix[8][8] = { +static const int __quecvt_compat_matrix[8][8] = { /* UN NL CR CW PR PW EX PD */ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ @@ -1630,8 +1630,8 @@ static int set_unlock_args(uint32_t flag return 0; } -int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_args *args) +static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) { int rv = -EINVAL; @@ -1685,7 +1685,7 @@ int validate_lock_args(struct dlm_ls *ls return rv; } -int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) +static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) { int rv = -EINVAL; Index: linux-2.6.12-mm1/drivers/dlm/lockspace.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lockspace.c +++ linux-2.6.12-mm1/drivers/dlm/lockspace.c @@ -47,7 +47,7 @@ int dlm_lockspace_init(void) return 0; } -int dlm_scand(void *data) +static int dlm_scand(void *data) { struct dlm_ls *ls; @@ -60,7 +60,7 @@ int dlm_scand(void *data) return 0; } -int dlm_scand_start(void) +static int dlm_scand_start(void) { struct task_struct *p; int error = 0; @@ -73,7 +73,7 @@ int dlm_scand_start(void) return error; } -void dlm_scand_stop(void) +static void dlm_scand_stop(void) { 
kthread_stop(scand_task); } Index: linux-2.6.12-mm1/drivers/dlm/main.c === --- linux-2.6.12-mm1.orig/drivers/dlm/main.c +++ linux-2.6.12-mm1/drivers/dlm/main.c @@ -30,7 +30,7 @@ static inline void dlm_unregister_debugf int dlm_node_ioctl_init(void); void dlm_node_ioctl_exit(void); -int __init init_dlm(void) +static int __init init_dlm(void) { int error; @@ -74,7 +74,7 @@ int __init init_dlm(void) return error; } -void __exit exit_dlm(void) +static void __exit exit_dlm(void) { dlm_lowcomms_exit(); dlm_member_sysfs_exit(); Index: linux-2.6.12-mm1/drivers/dlm/member.c === --- linux-2.6.12-mm1.orig/drivers/dlm/member.c +++ linux-2.6.12-mm1/drivers/dlm/member.c @@ -47,7 +47,7 @@ static void add_ordered_member(struct dl } } -int dlm_add_member(struct dlm_ls *ls, int nodeid) +static int dlm_add_member(struct dlm_ls *ls, int nodeid) { struct dlm_member *memb; @@ -61,13 +61,13 @@ int dlm_add_member(struct dlm_ls *ls, in return 0; } -void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) { list_move(memb-list, ls-ls_nodes_gone); ls-ls_num_nodes--; } -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static int dlm_is_member(struct dlm_ls *ls, int nodeid) { struct dlm_member *memb; Index: linux-2.6.12-mm1/drivers/dlm/recover.c === --- linux-2.6.12-mm1.orig/drivers/dlm/recover.c +++ linux-2.6.12-mm1/drivers/dlm/recover.c @@ -235,7 +235,7 @@ static struct dlm_rsb *recover_list_find return r; } -void recover_list_clear(struct dlm_ls *ls) +static void recover_list_clear(struct dlm_ls *ls) { struct dlm_rsb *r, *s; Index: linux-2.6.12-mm1/drivers/dlm/recoverd.c === --- linux-2.6.12-mm1.orig/drivers/dlm/recoverd.c +++ linux-2.6.12-mm1/drivers/dlm/recoverd.c @@ -658,7 +658,7 @@ static void do_ls_recovery(struct dlm_ls } } -int dlm_recoverd(void *arg) +static int dlm_recoverd(void *arg) { struct dlm_ls *ls; -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a 
message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 12/12] dlm: fix device refcount
An extra refcount was being left on devices. Signed-off-by: Patrick Caulfield [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux/drivers/dlm/device.c === --- linux.orig/drivers/dlm/device.c +++ linux/drivers/dlm/device.c @@ -449,8 +449,8 @@ static int dlm_open(struct inode *inode, spin_lock_init(f-fi_ast_lock); init_waitqueue_head(f-fi_wait); f-fi_ls = lsinfo; - atomic_set(f-fi_refcnt, 1); f-fi_flags = 0; + get_file_info(f); set_bit(1, f-fi_flags); file-private_data = f; @@ -602,6 +602,7 @@ static int dlm_close(struct inode *inode } } up(user_ls_lock); + put_file_info(f); /* Restore signals */ sigprocmask(SIG_SETMASK, tmpsig, NULL); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 11/12] dlm: return error in status reply
When a lockspace on a remote node is not found for a recovery status request, an error needs to be returned so the requesting node can distinguish it from a normal reply with a zero status. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux/drivers/dlm/rcom.c === --- linux.orig/drivers/dlm/rcom.c +++ linux/drivers/dlm/rcom.c @@ -78,13 +78,13 @@ static void make_config(struct dlm_ls *l rf-rf_lsflags = ls-ls_exflags; } -static int check_config(struct dlm_ls *ls, struct rcom_config *rf) +static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) { if (rf-rf_lvblen != ls-ls_lvblen || rf-rf_lsflags != ls-ls_exflags) { - log_error(ls, config mismatch %d,%d %x,%x, - rf-rf_lvblen, ls-ls_lvblen, - rf-rf_lsflags, ls-ls_exflags); + log_error(ls, config mismatch: %d,%x nodeid %d: %d,%x, + ls-ls_lvblen, ls-ls_exflags, + nodeid, rf-rf_lvblen, rf-rf_lsflags); return -EINVAL; } return 0; @@ -116,7 +116,15 @@ int dlm_rcom_status(struct dlm_ls *ls, i goto out; rc = (struct dlm_rcom *) ls-ls_recover_buf; - error = check_config(ls, (struct rcom_config *) rc-rc_buf); + + if (rc-rc_result == -ESRCH) { + /* we pretend the remote lockspace exists with 0 status */ + log_debug(ls, remote node %d not ready, nodeid); + rc-rc_result = 0; + } else + error = check_config(ls, (struct rcom_config *) rc-rc_buf, +nodeid); + /* the caller looks at rc_result for the remote recovery status */ out: return error; } @@ -369,7 +377,7 @@ static int send_ls_not_ready(int nodeid, rc-rc_header.h_cmd = DLM_RCOM; rc-rc_type = DLM_RCOM_STATUS_REPLY; - rc-rc_result = 0; + rc-rc_result = -ESRCH; dlm_rcom_out(rc); dlm_lowcomms_commit_buffer(mh); @@ -392,6 +400,8 @@ void dlm_receive_rcom(struct dlm_header ls = dlm_find_lockspace_global(hd-h_lockspace); if (!ls) { + log_print(lockspace %x from %d not found, + hd-h_lockspace, nodeid); send_ls_not_ready(nodeid, rc); return; } -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL 
PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 10/12] dlm: release list of root rsbs
The list of root rsb's created during recovery needs to be released if recovery is aborted early. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux/drivers/dlm/recoverd.c === --- linux.orig/drivers/dlm/recoverd.c +++ linux/drivers/dlm/recoverd.c @@ -205,6 +205,7 @@ static int ls_recover(struct dlm_ls *ls, return 0; fail: + dlm_release_root_list(ls); log_debug(ls, recover %PRIx64 error %d, rv-seq, error); up(ls-ls_recoverd_active); return error; -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 09/12] dlm: clear NEW_MASTER flag
If recover_locks() on an rsb doesn't find any locks to recover, we need to clear the NEW_MASTER flag since it won't be cleared by dlm_recovered_lock(). Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux/drivers/dlm/recover.c === --- linux.orig/drivers/dlm/recover.c +++ linux/drivers/dlm/recover.c @@ -502,6 +502,8 @@ static int recover_locks(struct dlm_rsb if (r-res_recover_locks_count) recover_list_add(r); + else + rsb_clear_flag(r, RSB_NEW_MASTER); out: unlock_rsb(r); return error; @@ -553,6 +555,8 @@ int dlm_recover_locks(struct dlm_ls *ls) void dlm_recovered_lock(struct dlm_rsb *r) { + DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r);); + r-res_recover_locks_count--; if (!r-res_recover_locks_count) { rsb_clear_flag(r, RSB_NEW_MASTER); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 08/12] dlm: no directory option
Per-lockspace option for dlm to run without using a resource directory. What would be the directory node for a resource is statically assigned to be the master node instead. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux/drivers/dlm/dir.c === --- linux.orig/drivers/dlm/dir.c +++ linux/drivers/dlm/dir.c @@ -72,15 +72,14 @@ void dlm_clear_free_entries(struct dlm_l * * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of * num_nodes to the hash value. This value in the desired range is used as an - * offset into the sorted list of nodeid's to give the particular nodeid of the - * directory node. + * offset into the sorted list of nodeid's to give the particular nodeid. */ -int dlm_dir_name2nodeid(struct dlm_ls *ls, char *name, int length) +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) { struct list_head *tmp; struct dlm_member *memb = NULL; - uint32_t hash, node, n = 0; + uint32_t node, n = 0; int nodeid; if (ls-ls_num_nodes == 1) { @@ -88,8 +87,6 @@ int dlm_dir_name2nodeid(struct dlm_ls *l goto out; } - hash = dlm_hash(name, length); - if (ls-ls_node_array) { node = (hash 16) % ls-ls_total_weight; nodeid = ls-ls_node_array[node]; @@ -114,9 +111,9 @@ int dlm_dir_name2nodeid(struct dlm_ls *l return nodeid; } -int dlm_dir_nodeid(struct dlm_rsb *rsb) +int dlm_dir_nodeid(struct dlm_rsb *r) { - return dlm_dir_name2nodeid(rsb-res_ls, rsb-res_name, rsb-res_length); + return dlm_hash2nodeid(r-res_ls, r-res_hash); } static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) @@ -202,12 +199,15 @@ int dlm_recover_directory(struct dlm_ls { struct dlm_member *memb; struct dlm_direntry *de; - char *b, *last_name; + char *b, *last_name = NULL; int error = -ENOMEM, last_len, count = 0; uint16_t namelen; log_debug(ls, dlm_recover_directory); + if (dlm_no_directory(ls)) + goto out_status; + dlm_dir_clear(ls); last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); @@ -221,12 +221,12 @@ int dlm_recover_directory(struct dlm_ls for (;;) 
{ error = dlm_recovery_stopped(ls); if (error) - goto free_last; + goto out_free; error = dlm_rcom_names(ls, memb-nodeid, last_name, last_len); if (error) - goto free_last; + goto out_free; schedule(); @@ -253,7 +253,7 @@ int dlm_recover_directory(struct dlm_ls error = -ENOMEM; de = get_free_de(ls, namelen); if (!de) - goto free_last; + goto out_free; de-master_nodeid = memb-nodeid; de-length = namelen; @@ -270,12 +270,11 @@ int dlm_recover_directory(struct dlm_ls ; } - dlm_set_recover_status(ls, DLM_RS_DIR); + out_status: error = 0; - + dlm_set_recover_status(ls, DLM_RS_DIR); log_debug(ls, dlm_recover_directory %d entries, count); - - free_last: + out_free: kfree(last_name); out: dlm_clear_free_entries(ls); Index: linux/drivers/dlm/dir.h === --- linux.orig/drivers/dlm/dir.h +++ linux/drivers/dlm/dir.h @@ -16,7 +16,7 @@ int dlm_dir_nodeid(struct dlm_rsb *rsb); -int dlm_dir_name2nodeid(struct dlm_ls *ls, char *name, int length); +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len); void dlm_dir_clear(struct dlm_ls *ls); void dlm_clear_free_entries(struct dlm_ls *ls); Index: linux/drivers/dlm/dlm_internal.h === --- linux.orig/drivers/dlm/dlm_internal.h +++ linux/drivers/dlm/dlm_internal.h @@ -270,6 +270,7 @@ struct dlm_rsb { int res_length; /* length of rsb name */ int res_nodeid; uint32_tres_lvbseq; + uint32_tres_hash; uint32_tres_bucket; /* rsbtbl */ unsigned long res_toss_time; uint32_tres_first_lkid; @@ -364,6 +365,7 @@ struct dlm_message { uint32_tm_sbflags; uint32_tm_flags; uint32_tm_lvbseq; + uint32_tm_hash; int
[patch 07/12] dlm: better handling of first lock
The first lock taken on an rsb is treated specially because the resource master needs to be looked up. There were some potential problems with this during recovery, and the whole thing was becoming too complex. This simplifies the special first-lock case and solves the problems. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/debug_fs.c === --- linux-2.6.12-mm1.orig/drivers/dlm/debug_fs.c +++ linux-2.6.12-mm1/drivers/dlm/debug_fs.c @@ -99,11 +99,16 @@ static int print_resource(struct dlm_rsb else seq_printf(s, %c, '.'); } - if (res-res_nodeid) + if (res-res_nodeid 0) seq_printf(s, \ \nLocal Copy, Master is node %d\n, res-res_nodeid); - else + else if (res-res_nodeid == 0) seq_printf(s, \ \nMaster Copy\n); + else if (res-res_nodeid == -1) + seq_printf(s, \ \nLooking up master (lkid %x)\n, + res-res_first_lkid); + else + seq_printf(s, \ \nInvalid master %d\n, res-res_nodeid); /* Print the LVB: */ if (res-res_lvbptr) { Index: linux-2.6.12-mm1/drivers/dlm/dlm_internal.h === --- linux-2.6.12-mm1.orig/drivers/dlm/dlm_internal.h +++ linux-2.6.12-mm1/drivers/dlm/dlm_internal.h @@ -272,8 +272,8 @@ struct dlm_rsb { uint32_tres_lvbseq; uint32_tres_bucket; /* rsbtbl */ unsigned long res_toss_time; - uint32_tres_trial_lkid; /* lkb trying lookup result */ - struct list_headres_lookup; /* lkbs waiting lookup confirm*/ + uint32_tres_first_lkid; + struct list_headres_lookup; /* lkbs waiting on first */ struct list_headres_hashchain; /* rsbtbl */ struct list_headres_grantqueue; struct list_headres_convertqueue; @@ -295,7 +295,6 @@ struct dlm_rsb { /* rsb_flags */ enum rsb_flags { - RSB_MASTER_WAIT, RSB_MASTER_UNCERTAIN, RSB_VALNOTVALID, RSB_VALNOTVALID_PREV, Index: linux-2.6.12-mm1/drivers/dlm/lock.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lock.c +++ linux-2.6.12-mm1/drivers/dlm/lock.c @@ -112,6 +112,7 @@ static const int __dlm_compat_matrix[8][ * 0 = LVB is written to the resource * -1 = nothing happens to the LVB */ + const int 
dlm_lvb_operations[8][8] = { /* UN NL CR CW PR PW EX PD*/ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ @@ -162,8 +163,8 @@ void dlm_print_lkb(struct dlm_lkb *lkb) void dlm_print_rsb(struct dlm_rsb *r) { - printk(KERN_ERR rsb: nodeid %d flags %lx trial %x rlc %d name %s\n, - r-res_nodeid, r-res_flags, r-res_trial_lkid, + printk(KERN_ERR rsb: nodeid %d flags %lx first %x rlc %d name %s\n, + r-res_nodeid, r-res_flags, r-res_first_lkid, r-res_recover_locks_count, r-res_name); } @@ -317,16 +318,13 @@ static int _search_rsb(struct dlm_ls *ls list_move(r-res_hashchain, ls-ls_rsbtbl[b].list); if (r-res_nodeid == -1) { - rsb_clear_flag(r, RSB_MASTER_WAIT); rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); - r-res_trial_lkid = 0; + r-res_first_lkid = 0; } else if (r-res_nodeid 0) { - rsb_clear_flag(r, RSB_MASTER_WAIT); rsb_set_flag(r, RSB_MASTER_UNCERTAIN); - r-res_trial_lkid = 0; + r-res_first_lkid = 0; } else { DLM_ASSERT(r-res_nodeid == 0, dlm_print_rsb(r);); - DLM_ASSERT(!rsb_flag(r, RSB_MASTER_WAIT), dlm_print_rsb(r);); DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),); } out: @@ -1398,19 +1396,10 @@ static void send_blocking_asts_all(struc lookup reply. Other lkb's waiting for the same rsb lookup are kept on the rsb's res_lookup list until the master is verified. - After a remote lookup or when a tossed rsb is retrived that specifies - a remote master, that master value is uncertain -- it may have changed - by the time we send it a request. While it's uncertain, only one lkb - is allowed to go ahead and use the master value; that lkb is specified - by res_trial_lkid. Once the trial lkb is queued on the master node - we know the rsb master is correct and any other lkbs on res_lookup - can get the rsb nodeid and go ahead with their request. - Return values: 0: nodeid is set in rsb/lkb and the caller should go ahead and use it 1: the rsb master is not available and the lkb has been placed on a wait queue - -EXXX: there was some error in processing */ static int set_master(struct dlm_rsb *r
[patch 05/12] dlm: rsb flag ops with inlined functions
Replace test/set/clear_bit of rsb flags with new inline functions that use the less expensive non-atomic bit ops. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/debug_fs.c === --- linux-2.6.12-mm1.orig/drivers/dlm/debug_fs.c +++ linux-2.6.12-mm1/drivers/dlm/debug_fs.c @@ -114,7 +114,7 @@ static int print_resource(struct dlm_rsb seq_printf(s, %02x , (unsigned char) res-res_lvbptr[i]); } - if (test_bit(RESFL_VALNOTVALID, res-res_flags)) + if (rsb_flag(res, RSB_VALNOTVALID)) seq_printf(s, (INVALID)); seq_printf(s, \n); } Index: linux-2.6.12-mm1/drivers/dlm/dlm_internal.h === --- linux-2.6.12-mm1.orig/drivers/dlm/dlm_internal.h +++ linux-2.6.12-mm1/drivers/dlm/dlm_internal.h @@ -262,25 +262,11 @@ struct dlm_lkb { longlkb_astparam; /* caller's ast arg */ }; - -/* find_rsb() flags */ - -#define R_MASTER 1 /* only return rsb if it's a master */ -#define R_CREATE 2 /* create/add rsb if not found */ - -#define RESFL_MASTER_WAIT 0 -#define RESFL_MASTER_UNCERTAIN 1 -#define RESFL_VALNOTVALID 2 -#define RESFL_VALNOTVALID_PREV 3 -#define RESFL_NEW_MASTER 4 -#define RESFL_NEW_MASTER2 5 -#define RESFL_RECOVER_CONVERT 6 - struct dlm_rsb { struct dlm_ls *res_ls;/* the lockspace */ struct kref res_ref; struct semaphoreres_sem; - unsigned long res_flags; /* RESFL_ */ + unsigned long res_flags; int res_length; /* length of rsb name */ int res_nodeid; uint32_tres_lvbseq; @@ -301,6 +287,38 @@ struct dlm_rsb { charres_name[1]; }; +/* find_rsb() flags */ + +#define R_MASTER 1 /* only return rsb if it's a master */ +#define R_CREATE 2 /* create/add rsb if not found */ + +/* rsb_flags */ + +enum rsb_flags { + RSB_MASTER_WAIT, + RSB_MASTER_UNCERTAIN, + RSB_VALNOTVALID, + RSB_VALNOTVALID_PREV, + RSB_NEW_MASTER, + RSB_NEW_MASTER2, + RSB_RECOVER_CONVERT, +}; + +static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __set_bit(flag, r-res_flags); +} + +static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + 
__clear_bit(flag, r-res_flags); +} + +static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + return test_bit(flag, r-res_flags); +} + /* dlm_header is first element of all structs sent between nodes */ Index: linux-2.6.12-mm1/drivers/dlm/lock.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lock.c +++ linux-2.6.12-mm1/drivers/dlm/lock.c @@ -317,18 +317,17 @@ static int _search_rsb(struct dlm_ls *ls list_move(r-res_hashchain, ls-ls_rsbtbl[b].list); if (r-res_nodeid == -1) { - clear_bit(RESFL_MASTER_WAIT, r-res_flags); - clear_bit(RESFL_MASTER_UNCERTAIN, r-res_flags); + rsb_clear_flag(r, RSB_MASTER_WAIT); + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); r-res_trial_lkid = 0; } else if (r-res_nodeid 0) { - clear_bit(RESFL_MASTER_WAIT, r-res_flags); - set_bit(RESFL_MASTER_UNCERTAIN, r-res_flags); + rsb_clear_flag(r, RSB_MASTER_WAIT); + rsb_set_flag(r, RSB_MASTER_UNCERTAIN); r-res_trial_lkid = 0; } else { DLM_ASSERT(r-res_nodeid == 0, dlm_print_rsb(r);); - DLM_ASSERT(!test_bit(RESFL_MASTER_WAIT, r-res_flags), - dlm_print_rsb(r);); - DLM_ASSERT(!test_bit(RESFL_MASTER_UNCERTAIN, r-res_flags),); + DLM_ASSERT(!rsb_flag(r, RSB_MASTER_WAIT), dlm_print_rsb(r);); + DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),); } out: *r_ret = r; @@ -837,7 +836,7 @@ static void set_lvb_lock(struct dlm_rsb } else if (b == 0) { if (lkb-lkb_exflags DLM_LKF_IVVALBLK) { - set_bit(RESFL_VALNOTVALID, r-res_flags); + rsb_set_flag(r, RSB_VALNOTVALID); return; } @@ -856,10 +855,10 @@ static void set_lvb_lock(struct dlm_rsb memcpy(r-res_lvbptr, lkb-lkb_lvbptr, len); r-res_lvbseq++; lkb-lkb_lvbseq = r-res_lvbseq; - clear_bit(RESFL_VALNOTVALID, r-res_flags); + rsb_clear_flag(r, RSB_VALNOTVALID); } - if (test_bit(RESFL_VALNOTVALID, r-res_flags
[patch 01/12] dlm: fix lowcomms race
Fix potential race in lowcomms. Signed-off-by: Patrick Caulfield [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/lowcomms.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lowcomms.c +++ linux-2.6.12-mm1/drivers/dlm/lowcomms.c @@ -1101,8 +1101,8 @@ static void process_output_queue(void) list_for_each_safe(list, temp, write_nodes) { struct nodeinfo *ni = list_entry(list, struct nodeinfo, write_list); - list_del(ni-write_list); clear_bit(NI_WRITE_PENDING, ni-flags); + list_del(ni-write_list); spin_unlock_bh(write_nodes_lock); @@ -1271,11 +1271,7 @@ static int daemons_start(void) /* * This is quite likely to sleep... - * Temporarily initialise the waitq head so that lowcomms_send_message - * doesn't crash if it gets called before the thread is fully - * initialised */ - int dlm_lowcomms_start(void) { int error; -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 02/12] dlm: resend lookups
During recovery, set the RESEND flag on locks waiting for a lookup so they'll be resent when recovery completes. Signed-off-by: David Teigland [EMAIL PROTECTED] Index: linux-2.6.12-mm1/drivers/dlm/lock.c === --- linux-2.6.12-mm1.orig/drivers/dlm/lock.c +++ linux-2.6.12-mm1/drivers/dlm/lock.c @@ -3212,12 +3212,20 @@ void dlm_recover_waiters_pre(struct dlm_ down(ls-ls_waiters_sem); list_for_each_entry_safe(lkb, safe, ls-ls_waiters, lkb_wait_reply) { - if (!dlm_is_removed(ls, lkb-lkb_nodeid)) - continue; - log_debug(ls, pre recover waiter lkid %x type %d flags %x, lkb-lkb_id, lkb-lkb_wait_type, lkb-lkb_flags); + /* all outstanding lookups, regardless of destination will be + resent after recovery is done */ + + if (lkb-lkb_wait_type == DLM_MSG_LOOKUP) { + lkb-lkb_flags |= DLM_IFL_RESEND; + continue; + } + + if (!dlm_is_removed(ls, lkb-lkb_nodeid)) + continue; + switch (lkb-lkb_wait_type) { case DLM_MSG_REQUEST: @@ -3244,11 +3252,6 @@ void dlm_recover_waiters_pre(struct dlm_ put_lkb(lkb); break; - case DLM_MSG_LOOKUP: - /* all outstanding lookups, regardless of dest. - will be resent after recovery is done */ - break; - default: log_error(ls, invalid lkb wait_type %d, lkb-lkb_wait_type); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC] nodemanager, ocfs2, dlm
Some of the comments about the dlm concerned how it's configured (from user space.) In particular, there was interest in seeing the dlm and ocfs2 use common methods for their configuration. The first area I'm looking at is how we get addresses/ids of other nodes. Currently, the dlm uses an ioctl on a misc device and ocfs2 uses a separate kernel module called ocfs2_nodemanager that's based on configfs. I've taken a stab at generalizing ocfs2_nodemanager so the dlm could use it (removing ocfs-specific stuff). It still needs some work, but I'd like to know if this appeals to the ocfs group and to others who were interested in seeing some similarity in dlm/ocfs configuration. Thanks, Dave diff -urN a/drivers/Kconfig b/drivers/Kconfig --- a/drivers/Kconfig 2005-07-18 13:40:31.011368352 +0800 +++ b/drivers/Kconfig 2005-07-18 13:46:17.661669496 +0800 @@ -68,4 +68,6 @@ source drivers/dlm/Kconfig +source drivers/nodemanager/Kconfig + endmenu diff -urN a/drivers/Makefile b/drivers/Makefile --- a/drivers/Makefile 2005-07-18 13:40:31.015367744 +0800 +++ b/drivers/Makefile 2005-07-18 13:46:06.846313680 +0800 @@ -70,3 +70,4 @@ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_DLM) += dlm/ +obj-$(CONFIG_NODEMANAGER) += nodemanager/ diff -urN a/drivers/nodemanager/Kconfig b/drivers/nodemanager/Kconfig --- a/drivers/nodemanager/Kconfig 1970-01-01 07:30:00.0 +0730 +++ b/drivers/nodemanager/Kconfig 2005-07-18 13:52:16.449125512 +0800 @@ -0,0 +1,9 @@ +menu Node Manager + +config NODEMANAGER + tristate Node Manager + help + Node addresses and IDs are provided from user space and made + available to kernel components from this module. 
+ +endmenu diff -urN a/drivers/nodemanager/Makefile b/drivers/nodemanager/Makefile --- a/drivers/nodemanager/Makefile 1970-01-01 07:30:00.0 +0730 +++ b/drivers/nodemanager/Makefile 2005-07-18 13:45:52.620476336 +0800 @@ -0,0 +1,3 @@ +obj-$(CONFIG_NODEMANAGER) += nodemanager.o + +nodemanager-y := nodemanager.o diff -urN a/drivers/nodemanager/nodemanager.c b/drivers/nodemanager/nodemanager.c --- a/drivers/nodemanager/nodemanager.c 1970-01-01 07:30:00.0 +0730 +++ b/drivers/nodemanager/nodemanager.c 2005-07-18 13:55:17.043670968 +0800 @@ -0,0 +1,655 @@ +/* + * nodemanager.c + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * Copyright (C) 2005 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +/* TODO: + - generic addresses (IPV4/6) + - multiple addresses per node + - more than 255 nodes (no static MAXNODE array) + - function to get a list of all nodes +*/ + +#include linux/kernel.h +#include linux/module.h +#include linux/idr.h +#include linux/configfs.h + +#include nodemanager.h + +enum { + NM_NODE_ATTR_NODEID = 0, + NM_NODE_ATTR_ADDRESS, + NM_NODE_ATTR_LOCAL, +}; + +struct clusters; +struct cluster; +struct nodes; +struct node; + +static ssize_t node_nodeid_read(struct node *, char *); +static ssize_t node_nodeid_write(struct node *, const char *, size_t); +static ssize_t node_ipv4_address_read(struct node *, char *); +static ssize_t node_ipv4_address_write(struct node *, const char *, size_t); +static ssize_t node_local_read(struct node *, char *); +static ssize_t node_local_write(struct node *, const char *, size_t); + +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); + +static ssize_t show_node(struct config_item *, struct configfs_attribute *, +char *); +static ssize_t store_node(struct config_item *, struct configfs_attribute *, + const char *, size_t); + + +struct node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct node *, char *); + ssize_t (*store)(struct node *, const char *, size_t); +}; + +static struct node_attribute
Re: [Ocfs2-devel] [RFC] nodemanager, ocfs2, dlm
On Tue, Jul 19, 2005 at 05:52:14PM +0200, Lars Marowsky-Bree wrote: The nodeid, I thought, was relative to a given DLM namespace, no? This concept seems to be missing here, or are you suggesting the nodeid to be global across namespaces? I'm not sure I understand what you mean. A node would have the same nodeid across different dlm locking-domains, assuming, of course, those dlm domains were in the context of the same cluster. The dlm only uses nodemanager to look up node addresses, though. Also, eventually we obviously need to have state for the nodes - up/down et cetera. I think the node manager also ought to track this. We don't have a need for that information yet; I'm hoping we won't ever need it in the kernel, but we'll see. How would kernel components use this and be notified about changes to the configuration / membership state? Nodemanager is perhaps a poor name; at the moment its only substantial purpose is to communicate node address/id associations in a way that's independent of a specific driver or fs. Changes to cluster configuration/membership happen in user space, of course. Those general events will have specific consequences to a given component (fs, lock manager, etc). These consequences vary quite widely depending on the component you're looking at. There are at least two ways to handle this: 1. Pass cluster events and data into the kernel (this sounds like what you're talking about above), notify the affected kernel components, each kernel component takes the cluster data and does whatever it needs to with it (internal adjustments, recovery, etc). 2. Each kernel component foo-kernel has an associated user space component foo-user. Cluster events (from userland clustering infrastructure) are passed to foo-user -- not into the kernel. foo-user determines what the specific consequences are for foo-kernel. foo-user then manipulates foo-kernel accordingly, through user/kernel hooks (sysfs, configfs, etc). 
These control hooks would largely be specific to foo. We're following option 2 with the dlm and gfs and have been for quite a while, which means we don't need 1. I think ocfs2 is moving that way, too. Someone could still try 1, of course, but it would be of no use or interest to me. I'm not aware of any actual projects pushing forward with something like 1, so the persistent reference to it is somewhat baffling. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Linux-cluster] [RFC] nodemanager, ocfs2, dlm
On Tue, Jul 19, 2005 at 05:48:26PM -0700, Mark Fasheh wrote: For OCFS2 that would mean that an ocfs2_nodemanager would still exist, but as a much smaller module sitting on top of 'nodemanager'. Yep, factoring out the common bits. So no port attribute. The OCFS2 network code normally takes port from the node manager in order to determine how to talk to a given node. We'll have to figure out how to resolve that. The easiest would be to add 'port' back, but I think that might be problematic if we have multiple cluster network infrastructures as we do today. The port is specific to the component using it (ocfs2, dlm, etc), so defining port as a node property doesn't make sense if nodemanager is providing node info to multiple components. Another way to handle this would be to have userspace symlink to the node items as an attribute on an ocfs2_tcp item. We could store 'port' as a second attribute. This would have the added benefit of pinning node information while OCFS2 uses it. I expect each component will probably use another per-node configfs object for component-specific attributes, using the common bits from the nodemanager object. + charnd_name[NODEMANAGER_MAX_NAME_LEN+1]; An accessor function for this would be nice for pretty prints - maybe strcpy into a passed string. ok + int nd_nodeid; This definitely won't work with OCFS2... Nodeid (what used to be called node_num) needs to be unsigned. Otherwise this will break all our nodemap stuff which uses a bitmap to represent cluster state. ok + struct list_headnd_status_list; What are these two for? They don't seem to be referenced elsewhere... Missed ripping them out with the other ocfs-specific stuff. + if (!tmp cluster-cl_has_local + cluster-cl_local_node == node-nd_nodeid) { + cluster-cl_local_node = 0; I think we might want to be setting cl_local_node to NODEMANAGER_MAX_NODES here. 
It seems that ocfs2_nodemanager also does this so we might have just caught a bug you inherited :) yep You removed o2nm_configured_node_map but we need some sort of method for enumerating over the set of configured nodes. Also we need a method for querying the existence of a node. The OCFS2 code usually uses o2nm_get_node_by_num(..) != NULL for this but a simple boolean api call would be cleaner and would avoid exposing the node structure. Right, those should be on the TODO. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS2: Pull request (fixes)
On Fri, Apr 05, 2013 at 11:34:45AM +0100, Steven Whitehouse wrote: Please consider pulling the following changes, There's some mixup here that should be cleared up first. David Teigland (2): GFS2: Fix unlock of fcntl locks during withdrawn state Steven Whitehouse (1): GFS2: Fix unlock of fcntl locks during withdrawn state -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] watchdog: Add hook for kicking in kdump path
On Wed, Apr 10, 2013 at 09:40:39AM -0400, Don Zickus wrote: However, we still have the problem that if the machine panics and we want to jump into the kdump kernel, we need to 'kick' the watchdog one more time. This provides us a sane sync point for determining how long we have to load the watchdog driver in the second kernel before the hardware reboots us. Otherwise the reboots are pretty random and nothing is guaranteed. Some time ago I submitted this patch http://www.spinics.net/lists/linux-watchdog/msg01477.html to get rid of the one extraneous ping that was causing me trouble. I'd still like to see it merged, but haven't had time to follow up. I have a use case where I need to guarantee that the watchdog will *not* be pinged unless my userland daemon does the ping. If my daemon is killed, the close() generates a ping that I don't intend. This kdump ping looks like it would be another instance that I'd need to suppress. Perhaps by renaming my flag WDOG_NO_EXTRA_PING and checking it both in release and in kick_for_kdump? (My daemon associates watchdog pings with shared storage heartbeats. Based on the heartbeats, hosts in a cluster can calculate when an unresponsive host last pinged its watchdog, and can be fairly certain that the dead host has been reset by its watchdog 60 seconds later. This is used as an alternative to i/o fencing where we're protecting data on shared storage from corruption after host failures. If there are uncontrolled watchdog pings, then hosts don't know when a dead host might have last pinged its watchdog, since it is no longer based on the last timestamp it wrote to shared storage.) Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 67/77] dlm: convert to idr_alloc()
On Wed, Feb 06, 2013 at 11:40:39AM -0800, Tejun Heo wrote: static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - int rv, id; + int rv; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -1199,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) mutex_init(lkb-lkb_cb_mutex); INIT_WORK(lkb-lkb_cb_work, dlm_callback_work); - retry: - rv = idr_pre_get(ls-ls_lkbidr, GFP_NOFS); - if (!rv) - return -ENOMEM; - + idr_preload(GFP_NOFS); spin_lock(ls-ls_lkbidr_spin); - rv = idr_get_new_above(ls-ls_lkbidr, lkb, 1, id); - if (!rv) - lkb-lkb_id = id; + rv = idr_alloc(ls-ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); Hi Tejun, I'm seeing a number of new failure/warning messages within this idr_alloc. I've not seen idr_alloc itself return an error yet. Is this an expected failure where the warnings should be suppressed? Dave kworker/u:3: page allocation failure: order:1, mode:0x20 Pid: 181, comm: kworker/u:3 Not tainted 3.9.0-rc2+ #1 Call Trace: [810c870b] warn_alloc_failed+0xeb/0x150 [8105f91e] ? __wake_up+0x4e/0x70 [810ca626] __alloc_pages_nodemask+0x666/0x930 [810ca626] ? __alloc_pages_nodemask+0x666/0x930 [811031ff] kmem_getpages+0x5f/0x1b0 [81103e33] fallback_alloc+0x173/0x250 [81103be3] cache_alloc_node+0x93/0x170 [811035f8] ? cache_alloc_refill+0x2a8/0x310 [81104e59] kmem_cache_alloc+0xd9/0x130 [811da11c] idr_layer_alloc+0x2c/0x80 [811dac8c] idr_get_empty_slot+0x2ec/0x390 [811db0ad] idr_alloc+0x4d/0xc0 [a031ded2] create_lkb+0x122/0x180 [dlm] [a03232a4] receive_request+0x34/0x440 [dlm] [a0331f07] ? dlm_wait_requestqueue+0x37/0x60 [dlm] [a0326aac] _receive_message+0x67c/0x1050 [dlm] [81425f39] ? mutex_unlock+0x9/0x10 [a0327605] dlm_receive_buffer+0x185/0x200 [dlm] [a032ab7f] dlm_process_incoming_buffer+0xef/0x210 [dlm] [a032c5cc] receive_from_sock+0x1ac/0x430 [dlm] [a032aee9] process_recv_sockets+0x29/0x40 [dlm] [8104e0d7] process_one_work+0x1c7/0x460 [8104e071] ? 
process_one_work+0x161/0x460 [8105124d] worker_thread+0x11d/0x3e0 [81051130] ? manage_workers+0x340/0x340 [81056606] kthread+0xe6/0xf0 [81056520] ? __init_kthread_worker+0x70/0x70 [8142feec] ret_from_fork+0x7c/0xb0 [81056520] ? __init_kthread_worker+0x70/0x70 Mem-Info: Node 0 DMA per-cpu: CPU0: hi:0, btch: 1 usd: 0 CPU1: hi:0, btch: 1 usd: 0 CPU2: hi:0, btch: 1 usd: 0 CPU3: hi:0, btch: 1 usd: 0 Node 0 DMA32 per-cpu: CPU0: hi: 186, btch: 31 usd: 163 CPU1: hi: 186, btch: 31 usd: 161 CPU2: hi: 186, btch: 31 usd: 183 CPU3: hi: 186, btch: 31 usd: 53 Node 1 DMA32 per-cpu: CPU0: hi: 186, btch: 31 usd: 0 CPU1: hi: 186, btch: 31 usd: 0 CPU2: hi: 186, btch: 31 usd: 191 CPU3: hi: 186, btch: 31 usd: 166 Node 1 Normal per-cpu: CPU0: hi: 186, btch: 31 usd: 0 CPU1: hi: 186, btch: 31 usd: 32 CPU2: hi: 186, btch: 31 usd: 162 CPU3: hi: 186, btch: 31 usd: 222 active_anon:4222 inactive_anon:8075 isolated_anon:0 active_file:511976 inactive_file:334346 isolated_file:0 unevictable:7742 dirty:0 writeback:0 unstable:0 free:6682 slab_reclaimable:68508 slab_unreclaimable:62477 mapped:8263 shmem:7537 pagetables:913 bounce:0 free_cma:0 Node 0 DMA free:7912kB min:28kB low:32kB high:40kB active_anon:0kB inactive_anon:0kB active_file:7376kB inactive_file:256kB unevictable:24kB isolated(anon):0kB isolated(file):0kB present:15972kB managed:15884kB mlocked:24kB dirty:0kB writeback:0kB mapped:24kB shmem:24kB slab_reclaimable:296kB slab_unreclaimable:20kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 1971 1971 1971 Node 0 DMA32 free:7560kB min:4016kB low:5020kB high:6024kB active_anon:7868kB inactive_anon:8kB active_file:1034116kB inactive_file:727656kB unevictable:1720kB isolated(anon):0kB isolated(file):0kB present:2080768kB managed:2019104kB mlocked:1720kB dirty:0kB writeback:0kB mapped:18188kB shmem:15648kB slab_reclaimable:101792kB slab_unreclaimable:110520kB kernel_stack:792kB pagetables:1632kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 1 DMA32 free:7928kB min:1952kB low:2440kB high:2928kB active_anon:44kB inactive_anon:888kB active_file:482644kB inactive_file:366988kB unevictable:26080kB isolated(anon):0kB isolated(file):0kB present:1047680kB managed:982144kB mlocked:26080kB dirty:0kB writeback:0kB mapped:4524kB shmem:4500kB slab_reclaimable:71140kB slab_unreclaimable:24332kB kernel_stack:24kB pagetables:96kB unstable:0kB
Re: [PATCH 67/77] dlm: convert to idr_alloc()
On Mon, Mar 11, 2013 at 01:28:18PM -0700, Tejun Heo wrote: Ah, right, in preloaded section, the allocation is expected to fail before falling back to the preload buffer and I forgot to add __GFP_NOWARN to the first try. Something like the following should make it go away. Can you please test it? Tested, and the warnings went away, thanks. Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] Revert dlm: check the maximum size of a request from user
Hi Linus, Please pull the following fix from branch: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git for-linus This reverts commit 2b75bc9121e54e22537207b47b71373bcb0be41c. There is something wrong with the CONFIG_COMPAT max size check in ioctl write. There is a report of a case where this breaks userland (clvmd) when maximum resource name lengths are used. I am still sorting out exactly which combinations of kernel and userland libs are a problem. Reported-by: Jana Saout j...@saout.de CC: Sasha Levin levinsasha...@gmail.com Signed-off-by: David Teigland teigl...@redhat.com --- fs/dlm/user.c | 7 --- 1 file changed, 7 deletions(-) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff4985..eb4ed9b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,13 +503,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else - if (count sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif - return -EINVAL; - kbuf = kzalloc(count + 1, GFP_NOFS); if (!kbuf) return -ENOMEM; -- 1.8.1.rc1.5.g7e0651a -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [GIT PULL] Revert dlm: check the maximum size of a request from user
On Mon, Feb 04, 2013 at 03:19:44PM -0500, Sasha Levin wrote: Hi David, This opens up a hole for userspace to force the kernel to allocate huge chunks of memory, triggering oom killing spree and such. It should probably be fixed instead of just reverted. I'll look into it. Here is the patch I'm planning to put in the queue for the next merge window, once it's been tested. Subject: [PATCH] dlm: check the write size from user Return EINVAL from write if the size is larger than allowed. Do this before allocating kernel memory for the bogus size, which could lead to OOM. Reported-by: Sasha Levin levinsasha...@gmail.com Signed-off-by: David Teigland teigl...@redhat.com --- fs/dlm/user.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index eb4ed9b..911649a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,6 +503,13 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; + /* +* can't compare against COMPAT/dlm_write_request32 because +* we don't yet know if is64bit is zero +*/ + if (count sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) + return -EINVAL; + kbuf = kzalloc(count + 1, GFP_NOFS); if (!kbuf) return -ENOMEM; -- 1.8.1.rc1.5.g7e0651a -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [GIT PULL] Revert dlm: check the maximum size of a request from user
On Mon, Feb 04, 2013 at 12:06:55PM -0500, David Teigland wrote: Please pull the following fix from branch: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git for-linus This reverts commit 2b75bc9121e54e22537207b47b71373bcb0be41c. Hi Linus, You can choose to pull that revert, or you can alternatively pull this fix to the original patch from this branch: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git fix-max-write Thanks to Jana who reported the problem and was able to test this fix so quickly. Subject: [PATCH] dlm: check the write size from user Return EINVAL from write if the size is larger than allowed. Do this before allocating kernel memory for the bogus size, which could lead to OOM. Reported-by: Sasha Levin levinsasha...@gmail.com Tested-by: Jana Saout j...@saout.de Signed-off-by: David Teigland teigl...@redhat.com --- fs/dlm/user.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff4985..911649a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else + /* +* can't compare against COMPAT/dlm_write_request32 because +* we don't yet know if is64bit is zero +*/ if (count sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif return -EINVAL; kbuf = kzalloc(count + 1, GFP_NOFS); -- 1.8.1.rc1.5.g7e0651a -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: fix a subtle bug in idr_get_next()
On Sat, Feb 02, 2013 at 03:11:35PM -0800, Tejun Heo wrote: On Sat, Feb 02, 2013 at 03:10:48PM -0800, Tejun Heo wrote: Fix it by ensuring proceeding to the next slot doesn't carry over the unaligned offset - ie. use round_up(id + 1, slot_distance) instead of id += slot_distance. Signed-off-by: Tejun Heo t...@kernel.org Reported-by: David Teigland teigl...@redhat.com Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com David, can you please test whether the patch makes the skipped deletion bug go away? Yes, I've tested, and it works fine now. Thanks, Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/14] dlm: don't use idr_remove_all()
On Mon, Jan 28, 2013 at 10:57:23AM -0500, David Teigland wrote: On Fri, Jan 25, 2013 at 05:31:08PM -0800, Tejun Heo wrote: idr_destroy() can destroy idr by itself and idr_remove_all() is being deprecated. The conversion isn't completely trivial for recover_idr_clear() as it's the only place in kernel which makes legitimate use of idr_remove_all() w/o idr_destroy(). Replace it with idr_remove() call inside idr_for_each_entry() loop. It goes on top so that it matches the operation order in recover_idr_del(). Only compile tested. Signed-off-by: Tejun Heo t...@kernel.org Cc: Christine Caulfield ccaul...@redhat.com Cc: David Teigland teigl...@redhat.com Cc: cluster-de...@redhat.com --- This patch depends on an earlier idr patch and given the trivial nature of the patch, I think it would be best to route these together through -mm. Please holler if there's any objection. Yes, that's good for me. I'll grab the set and test the dlm bits. Hi Tejun, Unfortunately, the list_for_each_entry doesn't seem to be clearing everything. I've seen warning: recover_list_count 39 at the end of that function. Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/14] dlm: don't use idr_remove_all()
On Tue, Jan 29, 2013 at 10:13:17AM -0500, David Teigland wrote: On Mon, Jan 28, 2013 at 10:57:23AM -0500, David Teigland wrote: On Fri, Jan 25, 2013 at 05:31:08PM -0800, Tejun Heo wrote: idr_destroy() can destroy idr by itself and idr_remove_all() is being deprecated. The conversion isn't completely trivial for recover_idr_clear() as it's the only place in kernel which makes legitimate use of idr_remove_all() w/o idr_destroy(). Replace it with idr_remove() call inside idr_for_each_entry() loop. It goes on top so that it matches the operation order in recover_idr_del(). Only compile tested. Signed-off-by: Tejun Heo t...@kernel.org Cc: Christine Caulfield ccaul...@redhat.com Cc: David Teigland teigl...@redhat.com Cc: cluster-de...@redhat.com --- This patch depends on an earlier idr patch and given the trivial nature of the patch, I think it would be best to route these together through -mm. Please holler if there's any objection. Yes, that's good for me. I'll grab the set and test the dlm bits. Hi Tejun, Unfortunately, the list_for_each_entry doesn't seem to be clearing everything. I've seen warning: recover_list_count 39 at the end of that function. I don't want to pretend to understand the internals of this idr code, but it's not clear that idr_for_each is equivalent to idr_for_each_entry when iterating through all id values. The ++id in idr_for_each_entry looks like it could lead to some missed entries? The comment about idr_get_next returning the next number to given id sounds like an entry with an id of ++id would be missed. Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/14] dlm: don't use idr_remove_all()
On Thu, Jan 31, 2013 at 04:18:41PM -0800, Tejun Heo wrote: It looks a bit weird to me that ls-ls_recover_list_count is also incremented by recover_list_add(). The two code paths don't seem to be interlocked at least upon my very shallow glance. Is it that only either the list or idr is in use? Yes, that's correct. Anyways, can you please apply the following patch and see which IDs are leaking from the iteration? The patch too is only compile tested so I might have done something stupid but it hopefully shouldn't be too difficult to make it work. I'm trying your patch now, I don't have a test optimized to hit this code so it may take a while. static void recover_idr_clear(struct dlm_ls *ls) { struct dlm_rsb *r; @@ -358,7 +364,9 @@ static void recover_idr_clear(struct dlm spin_lock(ls-ls_recover_idr_lock); + pr_info(XXX clearing:); idr_for_each_entry(ls-ls_recover_idr, r, id) { + pr_cont( %d, id); It will often be clearing hundreds of thousands of entries, so this will probably be excessive. if (ls-ls_recover_list_count != 0) { log_error(ls, warning: recover_list_count %d, ls-ls_recover_list_count); + pr_info(XXX leftovers: ); + idr_for_each(ls-ls_recover_idr, dlm_dump_idr, NULL); + pr_cont(\n); I already tried my own version of this, but used idr_for_each_entry a second time. Unfortunately, the number it found and printed did not match recover_list_count. 
warning: recover_list_count 566 It printed 304 ids: 41218 41222 41223 41224 41226 41228 41229 41230 41231 41232 41234 41235 41236 41237 41239 41241 41242 41243 41244 41245 41246 41249 41252 41253 41254 41255 41256 41257 41259 41260 41261 41263 41264 41266 41271 41272 41273 41274 41277 41278 41475 41480 41483 41524 41525 41526 41655 41731 41741 41745 41749 41767 41768 41769 41772 41773 41782 42113 42114 42115 42121 42122 42124 42128 42132 42136 42138 42139 42141 42165 42375 42381 42385 42388 42390 42392 42399 42401 42404 42407 42409 42411 42416 42422 42694 42699 42712 42717 42727 42866 43009 43042 43044 43046 43049 43051 43058 43059 43064 43065 43066 43067 43330 43332 43337 43338 43339 43343 43348 43349 43351 43354 43355 43356 43361 43362 43368 43369 43370 43375 43376 43377 43378 43379 43381 43575 43576 43577 43677 43678 43680 43683 43684 43685 43689 43690 43819 43820 43823 43824 43825 43826 43827 43828 43829 43831 43905 43907 43908 43912 43929 43930 43955 43956 43960 43962 43965 44288 44289 44291 44296 44298 44300 44310 44311 44313 44314 44316 44318 44321 44323 44325 44454 44456 44457 44458 44544 44547 44548 44550 44555 44557 44560 44562 44564 44567 44573 44575 44576 44578 44579 44581 44582 44583 44584 44585 44589 44592 44595 44596 44726 44728 44729 44732 44734 44866 44867 44873 44876 44878 44879 44912 44914 44916 44920 44923 44924 45053 45186 45189 45190 45195 45197 45199 45200 45201 45203 45204 45208 45209 45212 45213 45216 45220 45223 45224 45225 45227 45228 45231 45234 45440 45441 45444 45448 45450 45452 45454 45456 45457 45458 45459 45460 45461 45464 45466 45467 45472 45475 45477 45484 45485 45488 45492 45494 45495 45496 45497 45498 45499 45628 45630 45698 45699 45700 45703 45707 45708 45710 45713 45715 45717 45720 45722 45723 45724 45725 45727 45729 45730 45731 45733 45734 45737 45739 45741 45742 45746 45748 45750 45755 47292 47293 47294 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 09/14] dlm: use idr_for_each_entry() in recover_idr_clear() error path
On Fri, Jan 25, 2013 at 05:31:07PM -0800, Tejun Heo wrote: Convert recover_idr_clear() to use idr_for_each_entry() instead of idr_for_each(). It's somewhat less efficient this way but it shouldn't matter in an error path. This is to help with deprecation of idr_remove_all(). Only compile tested. Signed-off-by: Tejun Heo t...@kernel.org Cc: Christine Caulfield ccaul...@redhat.com Cc: David Teigland teigl...@redhat.com Cc: cluster-de...@redhat.com --- This patch depends on an earlier idr patch and I think it would be best to route these together through -mm. Christine, David, can you please ack this? Ack -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/14] dlm: don't use idr_remove_all()
On Fri, Jan 25, 2013 at 05:31:08PM -0800, Tejun Heo wrote: idr_destroy() can destroy idr by itself and idr_remove_all() is being deprecated. The conversion isn't completely trivial for recover_idr_clear() as it's the only place in kernel which makes legitimate use of idr_remove_all() w/o idr_destroy(). Replace it with idr_remove() call inside idr_for_each_entry() loop. It goes on top so that it matches the operation order in recover_idr_del(). Only compile tested. Signed-off-by: Tejun Heo t...@kernel.org Cc: Christine Caulfield ccaul...@redhat.com Cc: David Teigland teigl...@redhat.com Cc: cluster-de...@redhat.com --- This patch depends on an earlier idr patch and given the trivial nature of the patch, I think it would be best to route these together through -mm. Please holler if there's any objection. Yes, that's good for me. I'll grab the set and test the dlm bits. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] dlm updates for 3.9
Hi Linus, Please pull dlm updates from tag: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git dlm-3.9 This includes a single patch to avoid excessive and unnecessary scanning of rsbs to free. Patch copied below. Thanks, Dave dlm: avoid scanning unchanged toss lists Keep track of whether a toss list contains any shrinkable rsbs. If not, dlm_scand can avoid scanning the list for rsbs to shrink. Unnecessary scanning can otherwise waste a lot of time because the toss lists can contain a large number of rsbs that are non-shrinkable (directory records). Signed-off-by: David Teigland teigl...@redhat.com --- fs/dlm/dlm_internal.h | 3 +++ fs/dlm/lock.c | 15 +++ 2 files changed, 18 insertions(+) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 77c0f70..e7665c3 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -96,10 +96,13 @@ do { \ } +#define DLM_RTF_SHRINK 0x0001 + struct dlm_rsbtable { struct rb_root keep; struct rb_root toss; spinlock_t lock; + uint32_tflags; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a579f30..f750165 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref) rb_erase(r-res_hashnode, ls-ls_rsbtbl[r-res_bucket].keep); rsb_insert(r, ls-ls_rsbtbl[r-res_bucket].toss); r-res_toss_time = jiffies; + ls-ls_rsbtbl[r-res_bucket].flags |= DLM_RTF_SHRINK; if (r-res_lvbptr) { dlm_free_lvb(r-res_lvbptr); r-res_lvbptr = NULL; @@ -1659,11 +1660,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b) char *name; int our_nodeid = dlm_our_nodeid(); int remote_count = 0; + int need_shrink = 0; int i, len, rv; memset(ls-ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); spin_lock(ls-ls_rsbtbl[b].lock); + + if (!(ls-ls_rsbtbl[b].flags DLM_RTF_SHRINK)) { + spin_unlock(ls-ls_rsbtbl[b].lock); + return; + } + for (n = rb_first(ls-ls_rsbtbl[b].toss); n; n = next) { next = rb_next(n); r = rb_entry(n, struct dlm_rsb, res_hashnode); @@ -1679,6 +1687,8 @@ static void 
shrink_bucket(struct dlm_ls *ls, int b) continue; } + need_shrink = 1; + if (!time_after_eq(jiffies, r-res_toss_time + dlm_config.ci_toss_secs * HZ)) { continue; @@ -1710,6 +1720,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b) rb_erase(r-res_hashnode, ls-ls_rsbtbl[b].toss); dlm_free_rsb(r); } + + if (need_shrink) + ls-ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + else + ls-ls_rsbtbl[b].flags = ~DLM_RTF_SHRINK; spin_unlock(ls-ls_rsbtbl[b].lock); /* -- 1.8.1.rc1.5.g7e0651a -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
GFS, what's remaining
Hi, this is the latest set of gfs patches, it includes some minor munging since the previous set. Andrew, could this be added to -mm? there's not much in the way of pending changes. http://redhat.com/~teigland/gfs2/20050901/gfs2-full.patch http://redhat.com/~teigland/gfs2/20050901/broken-out/ I'd like to get a list of specific things remaining for merging. I believe we've responded to everything from earlier reviews, they were very helpful and more would be excellent. The list begins with one item from before that's still pending: - Adapt the vfs so gfs (and other cfs's) don't need to walk vma lists. [cf. ops_file.c:walk_vm(), gfs works fine as is, but some don't like it.] ... Thanks Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 12/13] GFS: lock_nolock module
The lock_nolock module does no inter-node locking and allows gfs to be used as a local file system. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/locking/nolock/Makefile |3 fs/gfs2/locking/nolock/main.c | 267 2 files changed, 270 insertions(+) diff -urpN a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile --- a/fs/gfs2/locking/nolock/Makefile 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/nolock/Makefile 2005-09-01 17:23:56.963442912 +0800 @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS) += lock_nolock.o +lock_nolock-y := main.o + diff -urpN a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c --- a/fs/gfs2/locking/nolock/main.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/nolock/main.c 2005-09-01 17:23:56.952444584 +0800 @@ -0,0 +1,267 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/module.h +#include linux/slab.h +#include linux/module.h +#include linux/init.h +#include linux/types.h +#include linux/fs.h +#include linux/smp_lock.h + +#include ../harness/lm_interface.h + +struct nolock_lockspace { + unsigned int nl_lvb_size; +}; + +struct lm_lockops nolock_ops; + +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct) +{ + char *c; + unsigned int jid; + struct nolock_lockspace *nl; + + /* If there is a jid= in the hostdata, return that jid. + Otherwise, return zero. 
*/ + + c = strstr(host_data, jid=); + if (!c) + jid = 0; + else { + c += 4; + sscanf(c, %u, jid); + } + + nl = kmalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); + if (!nl) + return -ENOMEM; + + memset(nl, 0, sizeof(struct nolock_lockspace)); + nl-nl_lvb_size = min_lvb_size; + + lockstruct-ls_jid = jid; + lockstruct-ls_first = 1; + lockstruct-ls_lvb_size = min_lvb_size; + lockstruct-ls_lockspace = (lm_lockspace_t *)nl; + lockstruct-ls_ops = nolock_ops; + lockstruct-ls_flags = LM_LSFLAG_LOCAL; + + return 0; +} + +static void nolock_others_may_mount(lm_lockspace_t *lockspace) +{ +} + +static void nolock_unmount(lm_lockspace_t *lockspace) +{ + struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace; + kfree(nl); +} + +static void nolock_withdraw(lm_lockspace_t *lockspace) +{ +} + +/** + * nolock_get_lock - get a lm_lock_t given a descripton of the lock + * @lockspace: the lockspace the lock lives in + * @name: the name of the lock + * @lockp: return the lm_lock_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name, + lm_lock_t **lockp) +{ + *lockp = (lm_lock_t *)lockspace; + return 0; +} + +/** + * nolock_put_lock - get rid of a lock structure + * @lock: the lock to throw away + * + */ + +static void nolock_put_lock(lm_lock_t *lock) +{ +} + +/** + * nolock_lock - acquire a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * @req_state: the requested state + * @flags: modifier flags + * + * Returns: A bitmap of LM_OUT_* + */ + +static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags) +{ + return req_state | LM_OUT_CACHEABLE; +} + +/** + * nolock_unlock - unlock a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * + * Returns: 0 + */ + +static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state) +{ + return 0; +} + +static void 
nolock_cancel(lm_lock_t *lock) +{ +} + +/** + * nolock_hold_lvb - hold on to a lock value block + * @lock: the lock the LVB is associated with + * @lvbp: return the lm_lvb_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp) +{ + struct nolock_lockspace *nl = (struct nolock_lockspace *)lock; + int error = 0; + + *lvbp = kmalloc(nl-nl_lvb_size, GFP_KERNEL); + if (*lvbp) + memset(*lvbp, 0, nl-nl_lvb_size); + else + error = -ENOMEM; + + return error; +} + +/** + * nolock_unhold_lvb - release a LVB + * @lock: the lock the LVB is associated with + * @lvb: the lock value block + * + */ + +static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb
[PATCH 13/13] GFS: lock_dlm module
The lock_dlm module uses the DLM in linux/drivers/dlm/ for inter-node locking. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/locking/dlm/Makefile |3 fs/gfs2/locking/dlm/lock.c | 533 + fs/gfs2/locking/dlm/lock_dlm.h | 200 +++ fs/gfs2/locking/dlm/main.c | 62 fs/gfs2/locking/dlm/mount.c| 218 fs/gfs2/locking/dlm/plock.c| 274 + fs/gfs2/locking/dlm/sysfs.c| 283 + fs/gfs2/locking/dlm/thread.c | 355 +++ include/linux/lock_dlm_plock.h | 40 +++ 9 files changed, 1968 insertions(+) diff -urpN a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile --- a/fs/gfs2/locking/dlm/Makefile 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/dlm/Makefile 2005-09-01 17:48:48.143749048 +0800 @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS) += lock_dlm.o +lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o + diff -urpN a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c --- a/fs/gfs2/locking/dlm/lock.c1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/dlm/lock.c2005-09-01 17:48:48.139749656 +0800 @@ -0,0 +1,533 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ */ + +#include lock_dlm.h + +static char junk_lvb[GDLM_LVB_SIZE]; + +static void queue_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp-ls; + + clear_bit(LFL_ACTIVE, lp-flags); + + spin_lock(ls-async_lock); + list_add_tail(lp-clist, ls-complete); + spin_unlock(ls-async_lock); + wake_up(ls-thread_wait); +} + +static inline void gdlm_ast(void *astarg) +{ + queue_complete((struct gdlm_lock *) astarg); +} + +static inline void gdlm_bast(void *astarg, int mode) +{ + struct gdlm_lock *lp = astarg; + struct gdlm_ls *ls = lp-ls; + + if (!mode) { + printk(lock_dlm: bast mode zero %x,%PRIx64\n, + lp-lockname.ln_type, lp-lockname.ln_number); + return; + } + + spin_lock(ls-async_lock); + if (!lp-bast_mode) { + list_add_tail(lp-blist, ls-blocking); + lp-bast_mode = mode; + } else if (lp-bast_mode mode) + lp-bast_mode = mode; + spin_unlock(ls-async_lock); + wake_up(ls-thread_wait); +} + +void gdlm_queue_delayed(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp-ls; + + spin_lock(ls-async_lock); + list_add_tail(lp-delay_list, ls-delayed); + spin_unlock(ls-async_lock); +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int16_t make_mode(int16_t lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + default: + GDLM_ASSERT(0, printk(unknown LM state %d\n, lmstate);); + } +} + +/* convert dlm lock-mode to gfs lock-state */ + +int16_t gdlm_make_lmstate(int16_t dlmmode) +{ + switch (dlmmode) { + case DLM_LOCK_IV: + case DLM_LOCK_NL: + return LM_ST_UNLOCKED; + case DLM_LOCK_EX: + return LM_ST_EXCLUSIVE; + case DLM_LOCK_CW: + return LM_ST_DEFERRED; + case DLM_LOCK_PR: + return LM_ST_SHARED; + default: + GDLM_ASSERT(0, printk(unknown DLM mode %d\n, dlmmode);); + } +} + +/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and + DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. 
*/ + +static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) +{ + int16_t cur = make_mode(cur_state); + if (lp-cur != DLM_LOCK_IV) + GDLM_ASSERT(lp-cur == cur, printk(%d, %d\n, lp-cur, cur);); +} + +static inline unsigned int make_flags(struct gdlm_lock *lp, + unsigned int gfs_flags, + int16_t cur, int16_t req) +{ + unsigned int lkf = 0; + + if (gfs_flags LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf
[PATCH 11/13] GFS: lock_harness module
The lock_harness module allows a gfs file system to connect to a given lock module. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/locking/harness/Makefile |3 fs/gfs2/locking/harness/lm_interface.h | 286 + fs/gfs2/locking/harness/main.c | 206 +++ 3 files changed, 495 insertions(+) diff -urpN a/fs/gfs2/locking/harness/Makefile b/fs/gfs2/locking/harness/Makefile --- a/fs/gfs2/locking/harness/Makefile 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/harness/Makefile 2005-09-01 17:23:36.150606944 +0800 @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS) += lock_harness.o +lock_harness-y := main.o + diff -urpN a/fs/gfs2/locking/harness/lm_interface.h b/fs/gfs2/locking/harness/lm_interface.h --- a/fs/gfs2/locking/harness/lm_interface.h1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/locking/harness/lm_interface.h2005-09-01 17:23:36.119611656 +0800 @@ -0,0 +1,286 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LM_INTERFACE_DOT_H__ +#define __LM_INTERFACE_DOT_H__ + +/* + * Opaque handles represent the lock module's lockspace structure, the lock + * module's lock structures, and GFS's file system (superblock) structure. + */ + +typedef void lm_lockspace_t; +typedef void lm_lock_t; +typedef void lm_fsdata_t; + +typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type, + void *data); + +/* + * lm_mount() flags + * + * LM_MFLAG_SPECTATOR + * GFS is asking to join the filesystem's lockspace, but it doesn't want to + * modify the filesystem. The lock module shouldn't assign a journal to the FS + * mount. It shouldn't send recovery callbacks to the FS mount. If the node + * dies or withdraws, all locks can be wiped immediately. 
+ */ + +#define LM_MFLAG_SPECTATOR 0x0001 + +/* + * lm_lockstruct flags + * + * LM_LSFLAG_LOCAL + * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS + * can make single-node optimizations. + */ + +#define LM_LSFLAG_LOCAL0x0001 + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. 
+ */ + +#define LM_FLAG_TRY0x0001 +#define LM_FLAG_TRY_1CB0x0002 +#define LM_FLAG_NOEXP 0x0004 +#define LM_FLAG_ANY0x0008 +#define LM_FLAG_PRIORITY 0x0010 + +/* + * lm_lock() and lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CACHEABLE + * The lock hasn't been released so GFS can continue to cache data for it. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + * LM_OUT_ASYNC + * The result of the request will be returned in an LM_CB_ASYNC callback. + */ + +#define LM_OUT_ST_MASK 0x0003 +#define LM_OUT_CACHEABLE
[PATCH 06/13] GFS: logging and recovery
A per-node on-disk log is used for recovery. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/log.c | 670 + fs/gfs2/log.h | 68 + fs/gfs2/recovery.c | 561 fs/gfs2/recovery.h | 32 ++ 4 files changed, 1331 insertions(+) --- a/fs/gfs2/log.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/log.c 2005-09-01 17:36:55.338111976 +0800 @@ -0,0 +1,670 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include asm/semaphore.h + +#include gfs2.h +#include bmap.h +#include glock.h +#include log.h +#include lops.h +#include meta_io.h + +#define PULL 1 + +static inline int is_done(struct gfs2_sbd *sdp, atomic_t *a) +{ + int done; + gfs2_log_lock(sdp); + done = atomic_read(a) ? 
FALSE : TRUE; + gfs2_log_unlock(sdp); + return done; +} + +static void do_lock_wait(struct gfs2_sbd *sdp, wait_queue_head_t *wq, +atomic_t *a) +{ + gfs2_log_unlock(sdp); + wait_event(*wq, is_done(sdp, a)); + gfs2_log_lock(sdp); +} + +static void lock_for_trans(struct gfs2_sbd *sdp) +{ + gfs2_log_lock(sdp); + do_lock_wait(sdp, sdp-sd_log_trans_wq, sdp-sd_log_flush_count); + atomic_inc(sdp-sd_log_trans_count); + gfs2_log_unlock(sdp); +} + +static void unlock_from_trans(struct gfs2_sbd *sdp) +{ + gfs2_assert_warn(sdp, atomic_read(sdp-sd_log_trans_count)); + if (atomic_dec_and_test(sdp-sd_log_trans_count)) + wake_up(sdp-sd_log_flush_wq); +} + +void gfs2_lock_for_flush(struct gfs2_sbd *sdp) +{ + gfs2_log_lock(sdp); + atomic_inc(sdp-sd_log_flush_count); + do_lock_wait(sdp, sdp-sd_log_flush_wq, sdp-sd_log_trans_count); + gfs2_log_unlock(sdp); +} + +void gfs2_unlock_from_flush(struct gfs2_sbd *sdp) +{ + gfs2_assert_warn(sdp, atomic_read(sdp-sd_log_flush_count)); + if (atomic_dec_and_test(sdp-sd_log_flush_count)) + wake_up(sdp-sd_log_trans_wq); +} + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. 
+ * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, +unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp-sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + + if (nstruct first) { + second = (sdp-sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_RU(nstruct - first, second); + } + + return blks; +} + +void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +{ + struct list_head *head = sdp-sd_ail1_list; + uint64_t sync_gen; + struct list_head *first, *tmp; + struct gfs2_ail *first_ai, *ai; + + gfs2_log_lock(sdp); + if (list_empty(head)) { + gfs2_log_unlock(sdp); + return; + } + sync_gen = sdp-sd_ail_sync_gen++; + + first = head-prev; + first_ai = list_entry(first, struct gfs2_ail, ai_list); + first_ai-ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, first_ai); + + if (flags DIO_ALL) + first = NULL; + + for (;;) { + if (first + (head-prev != first || +gfs2_ail1_empty_one(sdp, first_ai, 0))) + break; + + for (tmp = head-prev; tmp != head; tmp = tmp-prev) { + ai = list_entry(tmp, struct gfs2_ail, ai_list); + if (ai-ai_sync_gen = sync_gen) + continue; + ai-ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, ai); + break; + } + + if (tmp == head) + break; + } + + gfs2_log_unlock(sdp); +} + +int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +{ + struct list_head *head, *tmp, *prev; + struct gfs2_ail *ai; + int ret; + + gfs2_log_lock(sdp); + + for (head = sdp-sd_ail1_list, tmp = head-prev, prev = tmp-prev; +tmp != head
[PATCH 10/13] GFS: build and documentation
Add gfs to the build system and gfs2.txt to Documentation. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- Documentation/filesystems/gfs2.txt | 194 + fs/Kconfig | 15 ++ fs/Makefile|1 fs/gfs2/Makefile | 45 4 files changed, 255 insertions(+) --- a/fs/gfs2/Makefile 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/Makefile 2005-09-01 17:36:55.572076408 +0800 @@ -0,0 +1,45 @@ +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := \ + acl.o \ + bits.o \ + bmap.o \ + daemon.o \ + dir.o \ + eaops.o \ + eattr.o \ + glock.o \ + glops.o \ + inode.o \ + ioctl.o \ + jdata.o \ + lm.o \ + log.o \ + lops.o \ + lvb.o \ + main.o \ + meta_io.o \ + mount.o \ + ondisk.o \ + ops_address.o \ + ops_dentry.o \ + ops_export.o \ + ops_file.o \ + ops_fstype.o \ + ops_inode.o \ + ops_super.o \ + ops_vm.o \ + page.o \ + quota.o \ + resize.o \ + recovery.o \ + rgrp.o \ + super.o \ + sys.o \ + trans.o \ + unlinked.o \ + util.o + +obj-$(CONFIG_GFS2_FS) += locking/harness/ +obj-$(CONFIG_GFS2_FS) += locking/nolock/ +obj-$(CONFIG_GFS2_FS) += locking/dlm/ + --- a/fs/Makefile 2005-09-01 16:59:28.042752800 +0800 +++ b/fs/Makefile 2005-09-01 17:10:11.211976216 +0800 @@ -105,3 +105,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_RELAYFS_FS) += relayfs/ obj-$(CONFIG_9P_FS)+= 9p/ +obj-$(CONFIG_GFS2_FS) += gfs2/ --- a/fs/Kconfig2005-09-01 16:59:28.038753408 +0800 +++ b/fs/Kconfig2005-09-01 17:09:39.810749928 +0800 @@ -360,6 +360,21 @@ - POSIX ACLs - readpages / writepages (not user visible) +config GFS2_FS + tristate GFS2 file system support + depends on DLM + select FS_POSIX_ACL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers to coordinate their I/O so + filesystem consistency is maintained.
One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + config MINIX_FS tristate Minix fs support help --- a/Documentation/filesystems/gfs2.txt1970-01-01 07:30:00.0 +0730 +++ b/Documentation/filesystems/gfs2.txt2005-09-01 17:36:55.593073216 +0800 @@ -0,0 +1,194 @@ +Global File System +-- + +http://sources.redhat.com/cluster/ + +GFS is a cluster file system. It allows a cluster of computers to +simultaneously use a block device that is shared between them (with FC, +iSCSI, NBD, etc). GFS reads and writes to the block device like a local +file system, but also uses a lock module to allow the computers to coordinate +their I/O so file system consistency is maintained. One of the nifty +features of GFS is perfect consistency -- changes made to the file system +on one machine show up immediately on all other machines in the cluster. + +GFS uses interchangeable inter-node locking mechanisms. GFS plugs into one +side of a module called lock_harness and different lock modules can plug +into the other side of the harness. Each gfs file system selects the +appropriate lock module at mount time. Lock modules include: + + lock_nolock -- does no real locking and allows gfs to be used as a + local file system + + lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking + The dlm is found at linux/drivers/dlm/ + +In addition to interfacing with an external locking manager, a gfs lock +module is responsible for interacting with external cluster management +systems. Lock_dlm depends on user space cluster management systems found +at the location above. + +To use gfs as a local file system, no external clustering systems are +needed, simply: + + $ gfs2_mkfs -p lock_nolock -j 1 /dev/block_device + $ mount -t gfs2 /dev/block_device /dir + +GFS2 is not on-disk compatible with previous versions of GFS.
+ + +The following man pages can be found at the location above: + gfs2_mkfs to make a filesystem + gfs2_fsck to repair a filesystem + gfs2_grow to expand a filesystem online + gfs2_jadd to add journals to a filesystem online + gfs2_tool to manipulate, examine and tune a filesystem + gfs2_quota to examine and change quota values in a filesystem + gfs2_mount to find mount options + +Mount options (from the gfs2_mount man page) + + lockproto=LockModuleName
[PATCH 07/13] GFS: quotas
Code that deals with quotas. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/lvb.c | 61 ++ fs/gfs2/lvb.h | 28 + fs/gfs2/quota.c | 1209 fs/gfs2/quota.h | 34 + 4 files changed, 1332 insertions(+) --- a/fs/gfs2/quota.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/quota.c 2005-09-01 17:36:55.443096016 +0800 @@ -0,0 +1,1209 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include linux/tty.h +#include linux/sort.h +#include asm/semaphore.h + +#include gfs2.h +#include bmap.h +#include glock.h +#include glops.h +#include jdata.h +#include log.h +#include meta_io.h +#include quota.h +#include rgrp.h +#include super.h +#include trans.h + +#define QUOTA_USER 1 +#define QUOTA_GROUP 0 + +static uint64_t qd2offset(struct gfs2_quota_data *qd) +{ + uint64_t offset; + + offset = 2 * (uint64_t)qd-qd_id + !test_bit(QDF_USER, qd-qd_flags); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); + if (!qd) + return -ENOMEM; + + qd-qd_count = 1; + qd-qd_id = id; + if (user) + set_bit(QDF_USER, qd-qd_flags); + qd-qd_slot = -1; + + error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user, + gfs2_quota_glops, CREATE, qd-qd_gl); + if (error) + goto fail; + + error = gfs2_lvb_hold(qd-qd_gl); + gfs2_glock_put(qd-qd_gl); + if (error) + goto fail; + + *qdp = qd; + + return 0; + + fail: + 
kfree(qd); + return error; +} + +static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *new_qd = NULL; + int error, found; + + *qdp = NULL; + + for (;;) { + found = FALSE; + spin_lock(sdp-sd_quota_spin); + list_for_each_entry(qd, sdp-sd_quota_list, qd_list) { + if (qd-qd_id == id + !test_bit(QDF_USER, qd-qd_flags) == !user) { + qd-qd_count++; + found = TRUE; + break; + } + } + + if (!found) + qd = NULL; + + if (!qd new_qd) { + qd = new_qd; + list_add(qd-qd_list, sdp-sd_quota_list); + atomic_inc(sdp-sd_quota_count); + new_qd = NULL; + } + + spin_unlock(sdp-sd_quota_spin); + + if (qd || !create) { + if (new_qd) { + gfs2_lvb_unhold(new_qd-qd_gl); + kfree(new_qd); + } + *qdp = qd; + return 0; + } + + error = qd_alloc(sdp, user, id, new_qd); + if (error) + return error; + } +} + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd-qd_gl-gl_sbd; + + spin_lock(sdp-sd_quota_spin); + gfs2_assert(sdp, qd-qd_count,); + qd-qd_count++; + spin_unlock(sdp-sd_quota_spin); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd-qd_gl-gl_sbd; + spin_lock(sdp-sd_quota_spin); + gfs2_assert(sdp, qd-qd_count,); + if (!--qd-qd_count) + qd-qd_last_touched = jiffies; + spin_unlock(sdp-sd_quota_spin); +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd-qd_gl-gl_sbd; + unsigned int c, o = 0, b; + unsigned char byte = 0; + + spin_lock(sdp-sd_quota_spin); + + if (qd-qd_slot_count++) { + spin_unlock(sdp-sd_quota_spin); + return 0; + } + + for (c = 0; c sdp-sd_quota_chunks; c++) + for (o = 0; o PAGE_SIZE; o++) { + byte = sdp-sd_quota_bitmap[c][o]; + if (byte != 0xFF) + goto found; + } + + goto
[PATCH 08/13] GFS: mount and tuning options
There are a variety of mount options, tunable parameters, internal statistics, and methods of online file system manipulation. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/ioctl.c | 1485 +++ fs/gfs2/ioctl.h | 15 fs/gfs2/mount.c | 209 +++ fs/gfs2/mount.h | 15 fs/gfs2/resize.c | 285 ++ fs/gfs2/resize.h | 19 fs/gfs2/sys.c| 201 +++ fs/gfs2/sys.h| 24 8 files changed, 2253 insertions(+) --- a/fs/gfs2/ioctl.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/ioctl.c 2005-09-01 17:36:55.321114560 +0800 @@ -0,0 +1,1485 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include linux/gfs2_ioctl.h +#include asm/semaphore.h +#include asm/uaccess.h + +#include gfs2.h +#include bmap.h +#include dir.h +#include eattr.h +#include glock.h +#include glops.h +#include inode.h +#include ioctl.h +#include jdata.h +#include log.h +#include meta_io.h +#include quota.h +#include resize.h +#include rgrp.h +#include super.h +#include trans.h + +typedef int (*gi_filler_t) (struct gfs2_inode *ip, + struct gfs2_ioctl *gi, + char *buf, + unsigned int size, + unsigned int *count); + +#define ARG_SIZE 32 + +/** + * gi_skeleton - Setup a buffer that functions can print into + * @ip: + * @gi: + * @filler: + * + * Returns: -errno or count of bytes copied to userspace + */ + +static int gi_skeleton(struct gfs2_inode *ip, struct gfs2_ioctl *gi, + gi_filler_t filler) +{ + unsigned int size = gfs2_tune_get(ip-i_sbd, gt_lockdump_size); + char *buf; + unsigned int count = 0; + int error; + + if (size 
gi-gi_size) + size = gi-gi_size; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = filler(ip, gi, buf, size, count); + if (error) + goto out; + + if (copy_to_user(gi-gi_data, buf, count + 1)) + error = -EFAULT; + else + error = count + 1; + + out: + kfree(buf); + + return error; +} + +/** + * gi_get_cookie - Return the cookie (identifying string) for a + * filesystem mount + * @ip: + * @gi: + * @buf: + * @size: + * @count: + * + * Returns: errno + */ + +static int gi_get_cookie(struct gfs2_inode *ip, struct gfs2_ioctl *gi, +char *buf, unsigned int size, unsigned int *count) +{ + int error = -ENOBUFS; + + if (gi-gi_argc != 1) + return -EINVAL; + + gfs2_printf(version 0\n); + gfs2_printf(%lu, (unsigned long)ip-i_sbd); + + error = 0; + + out: + return error; +} + +/** + * gi_get_super - Return the struct gfs2_sb for a filesystem + * @sdp: + * @gi: + * + * Returns: errno + */ + +static int gi_get_super(struct gfs2_sbd *sdp, struct gfs2_ioctl *gi) +{ + struct gfs2_holder sb_gh; + struct buffer_head *bh; + struct gfs2_sb *sb; + int error; + + if (gi-gi_argc != 1) + return -EINVAL; + if (gi-gi_size != sizeof(struct gfs2_sb)) + return -EINVAL; + + sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + + error = gfs2_glock_nq_num(sdp, +GFS2_SB_LOCK, gfs2_meta_glops, +LM_ST_SHARED, 0, sb_gh); + if (error) + goto out; + + error = gfs2_meta_read(sb_gh.gh_gl, + GFS2_SB_ADDR sdp-sd_fsb2bb_shift, + DIO_START | DIO_WAIT, + bh); + if (error) { + gfs2_glock_dq_uninit(sb_gh); + goto out; + } + gfs2_sb_in(sb, bh-b_data); + brelse(bh); + + gfs2_glock_dq_uninit(sb_gh); + + if (copy_to_user(gi-gi_data, sb, +sizeof(struct gfs2_sb))) + error = -EFAULT; + else + error = sizeof(struct gfs2_sb); + + out: + kfree(sb); + + return error; +} + +/** + * gi_get_args - Return the mount arguments + * @ip: + * @gi: + * @buf: + * @size: + * @count: + * + * Returns: errno + */ + +static int gi_get_args(struct gfs2_inode *ip, struct gfs2_ioctl *gi, + 
char *buf, unsigned int size, unsigned int *count) +{ + struct gfs2_sbd *sdp = ip-i_sbd
[PATCH 05/13] GFS: ea and acl
Code that handles extended attributes and ACL's. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/acl.c | 313 ++ fs/gfs2/acl.h | 37 + fs/gfs2/eaops.c | 179 ++ fs/gfs2/eaops.h | 30 + fs/gfs2/eattr.c | 1621 fs/gfs2/eattr.h | 91 +++ 6 files changed, 2271 insertions(+) --- a/fs/gfs2/acl.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/acl.c 2005-09-01 17:36:55.135142832 +0800 @@ -0,0 +1,313 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include linux/posix_acl.h +#include linux/posix_acl_xattr.h +#include asm/semaphore.h + +#include gfs2.h +#include acl.h +#include eaops.h +#include eattr.h +#include glock.h +#include inode.h +#include meta_io.h +#include trans.h + +#define ACL_ACCESS 1 +#define ACL_DEFAULT 0 + +int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, + struct gfs2_ea_request *er, + int *remove, mode_t *mode) +{ + struct posix_acl *acl; + int error; + + error = gfs2_acl_validate_remove(ip, access); + if (error) + return error; + + if (!er-er_data) + return -EINVAL; + + acl = posix_acl_from_xattr(er-er_data, er-er_data_len); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) { + *remove = TRUE; + return 0; + } + + error = posix_acl_valid(acl); + if (error) + goto out; + + if (access) { + error = posix_acl_equiv_mode(acl, mode); + if (!error) + *remove = TRUE; + else if (error 0) + error = 0; + } + + out: + posix_acl_release(acl); + + return error; +} + +int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) +{ + if 
(!ip-i_sbd-sd_args.ar_posix_acl) + return -EOPNOTSUPP; + if (current-fsuid != ip-i_di.di_uid !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(ip-i_di.di_mode)) + return -EOPNOTSUPP; + if (!access !S_ISDIR(ip-i_di.di_mode)) + return -EACCES; + + return 0; +} + +static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, + struct gfs2_ea_location *el, char **data, unsigned int *len) +{ + struct gfs2_ea_request er; + struct gfs2_ea_location el_this; + int error; + + if (!ip-i_di.di_eattr) + return 0; + + memset(er, 0, sizeof(struct gfs2_ea_request)); + if (access) { + er.er_name = GFS2_POSIX_ACL_ACCESS; + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; + } else { + er.er_name = GFS2_POSIX_ACL_DEFAULT; + er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; + } + er.er_type = GFS2_EATYPE_SYS; + + if (!el) + el = el_this; + + error = gfs2_ea_find(ip, er, el); + if (error) + return error; + if (!el-el_ea) + return 0; + if (!GFS2_EA_DATA_LEN(el-el_ea)) + goto out; + + er.er_data_len = GFS2_EA_DATA_LEN(el-el_ea); + er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); + error = -ENOMEM; + if (!er.er_data) + goto out; + + error = gfs2_ea_get_copy(ip, el, er.er_data); + if (error) + goto out_kfree; + + if (acl) { + *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + if (IS_ERR(*acl)) + error = PTR_ERR(*acl); + } + + out_kfree: + if (error || !data) + kfree(er.er_data); + else { + *data = er.er_data; + *len = er.er_data_len; + } + + out: + if (error || el == el_this) + brelse(el-el_bh); + + return error; +} + +/** + * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something + * @inode: the file we want to do something to + * @mask: what we want to do + * + * Returns: errno + */ + +int gfs2_check_acl_locked(struct inode *inode, int mask) +{ + struct posix_acl *acl = NULL; + int error; + + error = acl_get(get_v2ip(inode), ACL_ACCESS, acl, NULL, NULL, NULL); + if (error) + return error; + + if (acl) { + error = posix_acl_permission(inode, acl, mask
[PATCH 01/14] GFS: headers
Central header files that are widely used. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/gfs2.h | 77 +++ fs/gfs2/incore.h| 691 +++ include/linux/gfs2_ioctl.h | 30 + include/linux/gfs2_ondisk.h | 1119 4 files changed, 1917 insertions(+) --- a/fs/gfs2/gfs2.h1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/gfs2.h2005-09-01 17:36:55.202132648 +0800 @@ -0,0 +1,77 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +#include linux/gfs2_ondisk.h + +#include locking/harness/lm_interface.h +#include lvb.h +#include incore.h +#include util.h + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#define NO_CREATE 0 +#define CREATE 1 + +#define NO_WAIT 0 +#define WAIT 1 + +#define NO_FORCE 0 +#define FORCE 1 + +#if (BITS_PER_LONG == 64) +#define PRIu64 lu +#define PRId64 ld +#define PRIx64 lx +#define PRIX64 lX +#else +#define PRIu64 Lu +#define PRId64 Ld +#define PRIx64 Lx +#define PRIX64 LX +#endif + +/* Divide num by den. Round up if there is a remainder. 
*/ +#define DIV_RU(num, den) (((num) + (den) - 1) / (den)) +#define MAKE_MULT8(x) (((x) + 7) ~7) + +#define GFS2_FAST_NAME_SIZE 8 + +#define get_v2sdp(sb) ((struct gfs2_sbd *)(sb)-s_fs_info) +#define set_v2sdp(sb, sdp) (sb)-s_fs_info = (sdp) +#define get_v2ip(inode) ((struct gfs2_inode *)(inode)-u.generic_ip) +#define set_v2ip(inode, ip) (inode)-u.generic_ip = (ip) +#define get_v2fp(file) ((struct gfs2_file *)(file)-private_data) +#define set_v2fp(file, fp) (file)-private_data = (fp) +#define get_v2bd(bh) ((struct gfs2_bufdata *)(bh)-b_private) +#define set_v2bd(bh, bd) (bh)-b_private = (bd) +#define get_v2db(bh) ((struct gfs2_databuf *)(bh)-b_private) +#define set_v2db(bh, db) (bh)-b_private = (db) + +#define get_transaction ((struct gfs2_trans *)(current-journal_info)) +#define set_transaction(tr) (current-journal_info) = (tr) + +#define get_gl2ip(gl) ((struct gfs2_inode *)(gl)-gl_object) +#define set_gl2ip(gl, ip) (gl)-gl_object = (ip) +#define get_gl2rgd(gl) ((struct gfs2_rgrpd *)(gl)-gl_object) +#define set_gl2rgd(gl, rgd) (gl)-gl_object = (rgd) +#define get_gl2gl(gl) ((struct gfs2_glock *)(gl)-gl_object) +#define set_gl2gl(gl, gl2) (gl)-gl_object = (gl2) + +#endif /* __GFS2_DOT_H__ */ + --- a/fs/gfs2/incore.h 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/incore.h 2005-09-01 17:36:55.283120336 +0800 @@ -0,0 +1,691 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#define DIO_FORCE 0x0001 +#define DIO_CLEAN 0x0002 +#define DIO_DIRTY 0x0004 +#define DIO_START 0x0008 +#define DIO_WAIT 0x0010 +#define DIO_METADATA 0x0020 +#define DIO_DATA 0x0040 +#define DIO_RELEASE0x0080 +#define DIO_ALL0x0100 + +struct gfs2_log_operations; +struct gfs2_log_element; +struct gfs2_bitmap; +struct gfs2_rgrpd; +struct gfs2_bufdata; +struct gfs2_databuf; +struct gfs2_glock_operations; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_alloc; +struct gfs2_inode; +struct gfs2_file; +struct gfs2_revoke; +struct gfs2_revoke_replay; +struct gfs2_unlinked; +struct gfs2_quota_data; +struct gfs2_log_buf; +struct gfs2_trans; +struct gfs2_ail; +struct gfs2_jdesc; +struct gfs2_args; +struct gfs2_tune; +struct gfs2_gl_hash_bucket; +struct gfs2_sbd; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, +struct gfs2_log_descriptor *ld, int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + char *lo_name
[PATCH 04/13] GFS: allocation
Code that manages block allocation. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/bits.c | 179 +++ fs/gfs2/bits.h | 28 + fs/gfs2/rgrp.c | 1374 + fs/gfs2/rgrp.h | 62 ++ 4 files changed, 1643 insertions(+) --- a/fs/gfs2/rgrp.c1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/rgrp.c2005-09-01 17:36:55.478090696 +0800 @@ -0,0 +1,1374 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include asm/semaphore.h + +#include gfs2.h +#include bits.h +#include glock.h +#include glops.h +#include jdata.h +#include lops.h +#include meta_io.h +#include quota.h +#include rgrp.h +#include super.h +#include trans.h + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @sdp: the filesystem + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd-rd_sbd; + struct gfs2_bitmap *bi = NULL; + uint32_t length = rgd-rd_ri.ri_length; + uint32_t count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(uint32_t)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf length; buf++) { + bi = rgd-rd_bits + buf; + for (x = 0; x 4; x++) + count[x] += gfs2_bitcount(rgd, + bi-bi_bh-b_data + + bi-bi_offset, + bi-bi_len, x); + } + + if (count[0] != rgd-rd_rg.rg_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, free data mismatch: %u != %u\n, + count[0], rgd-rd_rg.rg_free); + return; + } + + tmp = rgd-rd_ri.ri_data - + rgd-rd_rg.rg_free - + rgd-rd_rg.rg_dinodes; + if (count[1] != tmp) { + if 
(gfs2_consist_rgrpd(rgd)) + fs_err(sdp, used data mismatch: %u != %u\n, + count[1], tmp); + return; + } + + if (count[2]) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, free metadata mismatch: %u != 0\n, + count[2]); + return; + } + + if (count[3] != rgd-rd_rg.rg_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, used metadata mismatch: %u != %u\n, + count[3], rgd-rd_rg.rg_dinodes); + return; + } +} + +static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block) +{ + uint64_t first = ri-ri_data0; + uint64_t last = first + ri-ri_data; + return !!(first = block block last); +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @n: The data block number + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(sdp-sd_rindex_spin); + + list_for_each_entry(rgd, sdp-sd_rindex_mru_list, rd_list_mru) { + if (rgrp_contains_block(rgd-rd_ri, blk)) { + list_move(rgd-rd_list_mru, sdp-sd_rindex_mru_list); + spin_unlock(sdp-sd_rindex_spin); + return rgd; + } + } + + spin_unlock(sdp-sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + gfs2_assert(sdp, !list_empty(sdp-sd_rindex_list),); + return list_entry(sdp-sd_rindex_list.next, struct gfs2_rgrpd, rd_list); +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: A RG + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + if (rgd-rd_list.next == rgd-rd_sbd-sd_rindex_list) + return NULL; + return list_entry(rgd-rd_list.next, struct gfs2_rgrpd, rd_list); +} + +static void clear_rgrpdi(struct gfs2_sbd *sdp) +{ + struct list_head *head; + struct gfs2_rgrpd 
*rgd; + struct gfs2_glock *gl; + + spin_lock(sdp
[PATCH 03/13] GFS: directories
Code that handles directory operations. Signed-off-by: Ken Preslan [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/gfs2/dir.c | 2158 ++ fs/gfs2/dir.h | 51 + 2 files changed, 2209 insertions(+) --- a/fs/gfs2/dir.c 1970-01-01 07:30:00.0 +0730 +++ b/fs/gfs2/dir.c 2005-09-01 17:36:55.180135992 +0800 @@ -0,0 +1,2158 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +/* +* Implements Extendible Hashing as described in: +* Extendible Hashing by Fagin, et al in +* __ACM Trans. on Database Systems__, Sept 1979. +* +* +* Here's the layout of dirents which is essentially the same as that of ext2 +* within a single block. The field de_name_len is the number of bytes +* actually required for the name (no null terminator). The field de_rec_len +* is the number of bytes allocated to the dirent. The offset of the next +* dirent in the block is (dirent + dirent-de_rec_len). When a dirent is +* deleted, the preceding dirent inherits its allocated space, ie +* prev-de_rec_len += deleted-de_rec_len. Since the next dirent is obtained +* by adding de_rec_len to the current dirent, this essentially causes the +* deleted dirent to get jumped over when iterating through all the dirents. +* +* When deleting the first dirent in a block, there is no previous dirent so +* the field de_ino is set to zero to designate it as deleted. When allocating +* a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the +* first dirent has (de_ino == 0) and de_rec_len is large enough, this first +* dirent is allocated. 
Otherwise it must go through all the 'used' dirents +* searching for one in which the amount of total space minus the amount of +* used space will provide enough space for the new dirent. +* +* There are two types of blocks in which dirents reside. In a stuffed dinode, +* the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of +* the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the +* beginning of the leaf block. The dirents reside in leaves when +* +* dip-i_di.di_flags GFS2_DIF_EXHASH is true +* +* Otherwise, the dirents are linear, within a single stuffed dinode block. +* +* When the dirents are in leaves, the actual contents of the directory file are +* used as an array of 64-bit block pointers pointing to the leaf blocks. The +* dirents are NOT in the directory file itself. There can be more than one block +* pointer in the array that points to the same leaf. In fact, when a directory +* is first converted from linear to exhash, all of the pointers point to the +* same leaf. +* +* When a leaf is completely full, the size of the hash table can be +* doubled unless it is already at the maximum size which is hard coded into +* GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, +* but never before the maximum hash table size has been reached. 
+*/ + +#include linux/sched.h +#include linux/slab.h +#include linux/smp_lock.h +#include linux/spinlock.h +#include linux/completion.h +#include linux/buffer_head.h +#include linux/sort.h +#include asm/semaphore.h + +#include gfs2.h +#include dir.h +#include glock.h +#include inode.h +#include jdata.h +#include meta_io.h +#include quota.h +#include rgrp.h +#include trans.h + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#if 1 +#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) 1) +#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) 1)) +#else +#define gfs2_disk_hash2offset(h) (((uint64_t)(h))) +#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p +#endif + +typedef int (*leaf_call_t) (struct gfs2_inode *dip, + uint32_t index, uint32_t len, uint64_t leaf_no, + void *data); + +/** + * int gfs2_filecmp - Compare two filenames + * @file1: The first filename + * @file2: The second filename + * @len_of_file2: The length of the second file + * + * This routine compares two filenames and returns TRUE if they are equal. + * + * Returns: TRUE (!=0) if the files are the same, otherwise FALSE (0). + */ + +int gfs2_filecmp(struct qstr *file1, char *file2, int len_of_file2) +{ + if (file1-len != len_of_file2) + return FALSE; + if (memcmp(file1-name, file2, file1-len)) + return FALSE; + return TRUE; +} + +/** + * dirent_first - Return the first dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * return first dirent whether bh points to leaf or stuffed dinode + * + * Returns: IS_LEAF
Re: [PATCH 01/14] GFS: headers
On Thu, Sep 01, 2005 at 04:19:34PM +0200, Arjan van de Ven wrote: +/* Endian functions */ e again why?? Why is this a compiletime hack? Either you care about either-endian on disk, at which point it has to be a runtime thing, or you make the on disk layout fixed endian, at which point you really shouldn't abstract be16_to_cpu etc any further! Again, on-disk is fixed little endian, so we have for example: #define gfs2_32_to_cpu le32_to_cpu #define cpu_to_gfs2_32 cpu_to_le32 To _test_ and _verify_ the endian-handling of the code we can #define GFS2_ENDIAN_BIG which switches the above to: #define gfs2_32_to_cpu be32_to_cpu #define cpu_to_gfs2_32 cpu_to_be32 We offered to remove this when I explained it before. It sounds like it would give you some comfort so I'll just go ahead and do it barring any pleas otherwise. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 06:56:03PM +0100, Christoph Hellwig wrote: Whether the gfs2 code is mergeable is a completely different question, and it seems at least debatable to submit a filesystem for inclusion I actually asked what needs to be done for merging. We appreciate the feedback and are carefully studying and working on all of it as usual. We'd also appreciate help, of course, if that sounds interesting to anyone. Thanks Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 01:35:23PM +0200, Arjan van de Ven wrote: + gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_count) > 0,); what is gfs2_assert() about anyway? please just use BUG_ON directly everywhere When a machine has many gfs file systems mounted at once it can be useful to know which one failed. Does the following look ok? #define gfs2_assert(sdp, assertion) \ do { \ if (unlikely(!(assertion))) { \ printk(KERN_ERR \ "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" \ "GFS2: fsid=%s: function = %s\n"\ "GFS2: fsid=%s: file = %s, line = %u\n" \ "GFS2: fsid=%s: time = %lu\n", \ sdp->sd_fsname, # assertion, \ sdp->sd_fsname, __FUNCTION__,\ sdp->sd_fsname, __FILE__, __LINE__, \ sdp->sd_fsname, get_seconds()); \ BUG();\ } \ } while (0) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 01:21:04PM -0700, Andrew Morton wrote: Alan Cox [EMAIL PROTECTED] wrote: - Why GFS is better than OCFS2, or has functionality which OCFS2 cannot possibly gain (or vice versa) - Relative merits of the two offerings You missed the important one - people actively use it and have been for some years. Same reason with have NTFS, HPFS, and all the others. On that alone it makes sense to include. Again, that's not a technical reason. It's _a_ reason, sure. But what are the technical reasons for merging gfs[2], ocfs2, both or neither? If one can be grown to encompass the capabilities of the other then we're left with a bunch of legacy code and wasted effort. GFS is an established fs, it's not going away, you'd be hard pressed to find a more widely used cluster fs on Linux. GFS is about 10 years old and has been in use by customers in production environments for about 5 years. It is a mature, stable file system with many features that have been technically refined over years of experience and customer/user feedback. The latest development cycle (GFS2) has focussed on improving performance, it's not a new file system -- the 2 indicates that it's not ondisk compatible with earlier versions. OCFS2 is a new file system. I expect they'll want to optimize for their own unique goals. When OCFS appeared everyone I know accepted it would coexist with GFS, each in their niche like every other fs. That's good, OCFS and GFS help each other technically even though they may eventually compete in some areas (which can also be good.) 
Dave Here's a random summary of technical features: - cluster infrastructure: a lot of work, perhaps as much as gfs itself, has gone into the infrastructure surrounding and supporting gfs - cluster infrastructure allows for easy cooperation with CLVM - interchangeable lock/cluster modules: gfs interacts with the external infrastructure, including lock manager, through an interchangeable module allowing the fs to be adapted to different environments. - a nolock module can be plugged in to use gfs as a local fs (can be selected at mount time, so any fs can be mounted locally) - quotas, acls, cluster flocks, direct io, data journaling, ordered/writeback journaling modes -- all supported - gfs transparently switches to a different locking scheme for direct io allowing parallel non-allocating writes with no lock contention - posix locks -- supported, although it's being reworked for better performance right now - asynchronous locking, lock prefetching + read-ahead - coherent shared-writeable memory mappings across the cluster - nfs3 support (multiple nfs servers exporting one gfs is very common) - extend fs online, add journals online - full fs quiesce to allow for block level snapshot below gfs - read-only mount - spectator mount (like ro but no journal allocated for the mount, no fencing needed for failed node that was mounted as spectator) - infrastructure in place for live ondisk inode migration, fs shrink - stuffed dinodes, small files are stored in the disk inode block - tunable (fuzzy) atime updates - fast, nondisruptive stat on files during non-allocating direct-io - fast, nondisruptive statfs (df) even during heavy fs usage - friendly handling of io errors: shut down fs and withdraw from cluster - largest GFS cluster deployed was around 200 nodes, most are much smaller - use many GFS file systems at once on a node and in a cluster - customers use GFS for: scientific apps, HA, NFS serving, database, others I'm sure - graphical management tools for gfs, clvm, and
the cluster infrastructure exist and are improving quickly - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Sat, Sep 03, 2005 at 08:14:00AM +0200, Arjan van de Ven wrote: On Sat, 2005-09-03 at 13:18 +0800, David Teigland wrote: On Thu, Sep 01, 2005 at 01:21:04PM -0700, Andrew Morton wrote: Alan Cox [EMAIL PROTECTED] wrote: - Why GFS is better than OCFS2, or has functionality which OCFS2 cannot possibly gain (or vice versa) - Relative merits of the two offerings You missed the important one - people actively use it and have been for some years. Same reason with have NTFS, HPFS, and all the others. On that alone it makes sense to include. Again, that's not a technical reason. It's _a_ reason, sure. But what are the technical reasons for merging gfs[2], ocfs2, both or neither? If one can be grown to encompass the capabilities of the other then we're left with a bunch of legacy code and wasted effort. GFS is an established fs, it's not going away, you'd be hard pressed to find a more widely used cluster fs on Linux. GFS is about 10 years old and has been in use by customers in production environments for about 5 years. but you submitted GFS2 not GFS. Just a new version, not a big difference. The ondisk format changed a little making it incompatible with the previous versions. We'd been holding out on the format change for a long time and thought now would be a sensible time to finally do it. This is also about timing things conveniently. Each GFS version coincides with a development cycle and we decided to wait for this version/cycle to move code upstream. So, we have a new version, format change, and code upstream all together, but it's still the same GFS to us. As with _any_ new version (involving ondisk formats or not) we need to thoroughly test everything to fix the inevitable bugs and regressions that are introduced, there's nothing new or surprising about that. About the name -- we need to support customers running both versions for a long time. The 2 was added to make that process a little easier and clearer for people, that's all.
If the 2 is really distressing we could rip it off, but there seem to be as many file systems ending in digits as not these days... Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Fri, Sep 02, 2005 at 10:28:21PM -0700, Greg KH wrote: On Fri, Sep 02, 2005 at 05:44:03PM +0800, David Teigland wrote: On Thu, Sep 01, 2005 at 01:35:23PM +0200, Arjan van de Ven wrote: + gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_count) > 0,); what is gfs2_assert() about anyway? please just use BUG_ON directly everywhere When a machine has many gfs file systems mounted at once it can be useful to know which one failed. Does the following look ok? #define gfs2_assert(sdp, assertion) \ do { \ if (unlikely(!(assertion))) { \ printk(KERN_ERR \ "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" \ "GFS2: fsid=%s: function = %s\n"\ "GFS2: fsid=%s: file = %s, line = %u\n" \ "GFS2: fsid=%s: time = %lu\n", \ sdp->sd_fsname, # assertion, \ sdp->sd_fsname, __FUNCTION__,\ sdp->sd_fsname, __FILE__, __LINE__, \ sdp->sd_fsname, get_seconds()); \ BUG();\ You will already get the __FUNCTION__ (and hence the __FILE__ info) directly from the BUG() dump, as well as the time from the syslog message (turn on the printk timestamps if you want a more fine grain timestamp), so the majority of this macro is redundant with the BUG() macro... Joern already suggested moving this out of line and into a function (as it was before) to avoid repeating string constants. In that case the function, file and line from BUG aren't useful. We now have this, does it look ok?
void gfs2_assert_i(struct gfs2_sbd *sdp, char *assertion, const char *function, char *file, unsigned int line) { panic(GFS2: fsid=%s: fatal: assertion \%s\ failed\n GFS2: fsid=%s: function = %s, file = %s, line = %u\n, sdp-sd_fsname, assertion, sdp-sd_fsname, function, file, line); } #define gfs2_assert(sdp, assertion) \ do { \ if (unlikely(!(assertion))) { \ gfs2_assert_i((sdp), #assertion, \ __FUNCTION__, __FILE__, __LINE__); \ } \ } while (0) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Linux-cluster] Re: GFS, what's remaining
On Sat, Sep 03, 2005 at 10:41:40PM -0700, Andrew Morton wrote: Joel Becker [EMAIL PROTECTED] wrote: What happens when we want to add some new primitive which has no posix-file analog? The point of dlmfs is not to express every primitive that the DLM has. dlmfs cannot express the CR, CW, and PW levels of the VMS locking scheme. Nor should it. The point isn't to use a filesystem interface for programs that need all the flexibility and power of the VMS DLM. The point is a simple system that programs needing the basic operations can use. Even shell scripts. Are you saying that the posix-file lookalike interface provides access to part of the functionality, but there are other APIs which are used to access the rest of the functionality? If so, what is that interface, and why cannot that interface offer access to 100% of the functionality, thus making the posix-file tricks unnecessary? We're using our dlm quite a bit in user space and require the full dlm API. It's difficult to export the full API through a pseudo fs like dlmfs, so we've not found it a very practical approach. That said, it's a nice idea and I'd be happy if someone could map a more complete dlm API onto it. We export our full dlm API through read/write/poll on a misc device. All user space apps use the dlm through a library as you'd expect. The library communicates with the dlm_device kernel module through read/write/poll and the dlm_device module talks with the actual dlm: linux/drivers/dlm/device.c If there's a better way to do this, via a pseudo fs or not, we'd be pleased to try it. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 01:35:23PM +0200, Arjan van de Ven wrote: +void gfs2_glock_hold(struct gfs2_glock *gl) +{ + glock_hold(gl); +} eh why? You removed the comment stating exactly why, see below. If that's not an accepted technique in the kernel, say so and I'll be happy to change it here and elsewhere. Thanks, Dave static inline void glock_hold(struct gfs2_glock *gl) { gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_count) > 0); atomic_inc(&gl->gl_count); } /** * gfs2_glock_hold() - As glock_hold(), but suitable for exporting * @gl: The glock to hold * */ void gfs2_glock_hold(struct gfs2_glock *gl) { glock_hold(gl); } - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 01:35:23PM +0200, Arjan van de Ven wrote: +static unsigned int handle_roll(atomic_t *a) +{ + int x = atomic_read(a); + if (x < 0) { + atomic_set(a, 0); + return 0; + } + return (unsigned int)x; +} this is just plain scary. Not really, it was just resetting atomic statistics counters when they became negative. Unnecessary, though, so removed. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Mon, Sep 05, 2005 at 10:58:08AM +0200, Jörn Engel wrote: #define gfs2_assert(sdp, assertion) do { \ if (unlikely(!(assertion))) { \ printk(KERN_ERR "GFS2: fsid=\n", (sdp)->sd_fsname); \ BUG(); \ } while (0) OK thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Linux-cluster] Re: GFS, what's remaining
On Mon, Sep 05, 2005 at 01:54:08AM -0700, Andrew Morton wrote: David Teigland [EMAIL PROTECTED] wrote: We export our full dlm API through read/write/poll on a misc device. inotify did that for a while, but we ended up going with a straight syscall interface. How fat is the dlm interface? ie: how many syscalls would it take? Four functions: create_lockspace() release_lockspace() lock() unlock() Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Linux-cluster] Re: GFS, what's remaining
On Mon, Sep 05, 2005 at 02:19:48AM -0700, Andrew Morton wrote: David Teigland [EMAIL PROTECTED] wrote: Four functions: create_lockspace() release_lockspace() lock() unlock() Neat. I'd be inclined to make them syscalls then. I don't suppose anyone is likely to object if we reserve those slots. Patrick is really the expert in this area and he's off this week, but based on what he's done with the misc device I don't see why there'd be more than two or three parameters for any of these. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS
On Mon, Aug 08, 2005 at 05:14:45PM +0300, Pekka J Enberg wrote: if (!dumping) down_read(mm-mmap_sem); + + for (vma = find_vma(mm, start); vma; vma = vma-vm_next) { + if (end = vma-vm_start) + break; + if (vma-vm_file + vma-vm_file-f_dentry-d_inode-i_sb == sb) { + num_gh++; + } + } + + ghs = kmalloc((num_gh + 1) * sizeof(struct gfs2_holder), + GFP_KERNEL); + if (!ghs) { + if (!dumping) + up_read(mm-mmap_sem); + return -ENOMEM; + } + + for (vma = find_vma(mm, start); vma; vma = vma-vm_next) { Sorry if this is an obvious question but what prevents another thread from doing mmap() before we do the second walk and messing up num_gh? mm-mmap_sem ? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Tue, Aug 02, 2005 at 09:45:24AM +0200, Arjan van de Ven wrote: * + if (create) + down_write(ip-i_rw_mutex); + else + down_read(ip-i_rw_mutex); why do you use a rwsem and not a regular semaphore? You are aware that rwsems are far more expensive than regular ones right? How skewed is the read/write ratio? Rough tests show around 4/1, that high or low? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
GFS - updated patches
Thanks for all the review and comments. This is a new set of patches that incorporates the suggestions we've received. http://redhat.com/~teigland/gfs2/20050811/gfs2-full.patch http://redhat.com/~teigland/gfs2/20050811/broken-out/ Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS - updated patches
On Thu, Aug 11, 2005 at 04:21:04PM +0800, Michael wrote: I have the same question as I asked before, how can I see GFS in make menuconfig, after I patch gfs2-full.patch into a 2.6.12.2 kernel? You need to select the dlm under drivers. It's in -mm, or apply http://redhat.com/~teigland/dlm.patch - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS - updated patches
On Thu, Aug 11, 2005 at 10:32:38AM +0200, Arjan van de Ven wrote: On Thu, 2005-08-11 at 16:17 +0800, David Teigland wrote: Thanks for all the review and comments. This is a new set of patches that incorporates the suggestions we've received. all of them or only a subset? All patches, now 01-13 (what was patch 08 disappeared entirely) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS - updated patches
On Thu, Aug 11, 2005 at 10:50:32AM +0200, Arjan van de Ven wrote: Thanks for all the review and comments. This is a new set of patches that incorporates the suggestions we've received. all of them or only a subset? with them I meant the suggestions not the patches ;) The large majority, and I think all that people care about. If we ignored something that someone thinks is important, a reminder would be useful. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/3] dlm: use configfs
Use configfs to configure lockspace members and node addresses. This was previously done with sysfs and ioctl. Signed-off-by: David Teigland [EMAIL PROTECTED] --- drivers/dlm/Makefile |1 drivers/dlm/config.c | 759 - drivers/dlm/config.h | 12 drivers/dlm/dlm_internal.h |2 drivers/dlm/lockspace.c|7 drivers/dlm/lowcomms.c | 195 +-- drivers/dlm/lowcomms.h |4 drivers/dlm/main.c | 18 - drivers/dlm/member.c | 40 +- drivers/dlm/member_sysfs.c | 76 drivers/dlm/node_ioctl.c | 126 --- drivers/dlm/requestqueue.c |2 include/linux/dlm_node.h | 44 -- 13 files changed, 828 insertions(+), 458 deletions(-) diff -urpN a/drivers/dlm/Makefile b/drivers/dlm/Makefile --- a/drivers/dlm/Makefile 2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/Makefile 2005-08-18 13:22:00.718154328 +0800 @@ -12,7 +12,6 @@ dlm-y := ast.o \ member_sysfs.o \ memory.o \ midcomms.o \ - node_ioctl.o \ rcom.o \ recover.o \ recoverd.o \ diff -urpN a/drivers/dlm/config.c b/drivers/dlm/config.c --- a/drivers/dlm/config.c 2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/config.c 2005-08-18 13:22:00.719154176 +0800 @@ -11,9 +11,756 @@ *** **/ -#include dlm_internal.h +#include linux/kernel.h +#include linux/module.h +#include linux/configfs.h +#include net/sock.h + #include config.h +/* + * /config/dlm/cluster/spaces/space/nodes/node/nodeid + * /config/dlm/cluster/spaces/space/nodes/node/weight + * /config/dlm/cluster/comms/comm/nodeid + * /config/dlm/cluster/comms/comm/local + * /config/dlm/cluster/comms/comm/addr + * The cluster level is useless, but I haven't figured out how to avoid it. 
+ */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct comm *local_comm; + +struct clusters; +struct cluster; +struct spaces; +struct space; +struct comms; +struct comm; +struct nodes; +struct node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, +char *buf); +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, +char *buf); +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); + +static ssize_t comm_nodeid_read(struct comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len); +static ssize_t comm_local_read(struct comm *cm, char *buf); +static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len); +static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len); +static ssize_t node_nodeid_read(struct node *nd, char *buf); +static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); +static ssize_t node_weight_read(struct 
node *nd, char *buf); +static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, +}; + +struct comm_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct comm *, char *); + ssize_t (*store)(struct comm *, const char *, size_t); +}; + +static struct comm_attribute comm_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, +.ca_name = nodeid, +.ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_nodeid_read, + .store = comm_nodeid_write, +}; + +static struct comm_attribute comm_attr_local = { + .attr = { .ca_owner = THIS_MODULE
[PATCH 2/3] dlm: remove file
The reduced member_sysfs.c is no longer related to lockspace members. Move what's left into lockspace.c which is the only file that uses the remaining functions. Signed-off-by: David Teigland [EMAIL PROTECTED] --- Makefile |1 lockspace.c| 155 +++-- lockspace.h|1 main.c | 14 +--- member.c |2 member_sysfs.c | 165 - member_sysfs.h | 22 --- 7 files changed, 156 insertions(+), 204 deletions(-) diff -urpN a/drivers/dlm/Makefile b/drivers/dlm/Makefile --- a/drivers/dlm/Makefile 2005-08-18 13:26:02.648375344 +0800 +++ b/drivers/dlm/Makefile 2005-08-18 13:26:25.736865360 +0800 @@ -9,7 +9,6 @@ dlm-y :=ast.o \ lowcomms.o \ main.o \ member.o \ - member_sysfs.o \ memory.o \ midcomms.o \ rcom.o \ diff -urpN a/drivers/dlm/lockspace.c b/drivers/dlm/lockspace.c --- a/drivers/dlm/lockspace.c 2005-08-18 13:26:02.651374888 +0800 +++ b/drivers/dlm/lockspace.c 2005-08-18 13:26:25.737865208 +0800 @@ -14,7 +14,6 @@ #include dlm_internal.h #include lockspace.h #include member.h -#include member_sysfs.h #include recoverd.h #include ast.h #include dir.h @@ -38,13 +37,159 @@ static spinlock_t lslist_lock; static struct task_struct *scand_task; +static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int n = simple_strtol(buf, NULL, 0); + + switch (n) { + case 0: + dlm_ls_stop(ls); + break; + case 1: + dlm_ls_start(ls); + break; + default: + ret = -EINVAL; + } + return ret; +} + +static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls-ls_uevent_result = simple_strtol(buf, NULL, 0); + set_bit(LSFL_UEVENT_WAIT, ls-ls_flags); + wake_up(ls-ls_uevent_wait); + return len; +} + +static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) +{ + return sprintf(buf, %u\n, ls-ls_global_id); +} + +static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls-ls_global_id = simple_strtoul(buf, NULL, 0); + return len; +} + +struct dlm_attr { + struct attribute attr; + ssize_t (*show)(struct dlm_ls *, char *); 
+ ssize_t (*store)(struct dlm_ls *, const char *, size_t); +}; + +static struct dlm_attr dlm_attr_control = { + .attr = {.name = control, .mode = S_IWUSR}, + .store = dlm_control_store +}; + +static struct dlm_attr dlm_attr_event = { + .attr = {.name = event_done, .mode = S_IWUSR}, + .store = dlm_event_store +}; + +static struct dlm_attr dlm_attr_id = { + .attr = {.name = id, .mode = S_IRUGO | S_IWUSR}, + .show = dlm_id_show, + .store = dlm_id_store +}; + +static struct attribute *dlm_attrs[] = { + dlm_attr_control.attr, + dlm_attr_event.attr, + dlm_attr_id.attr, + NULL, +}; + +static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, +char *buf) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a-show ? a-show(ls, buf) : 0; +} + +static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a-store ? 
a-store(ls, buf, len) : len; +} + +static struct sysfs_ops dlm_attr_ops = { + .show = dlm_attr_show, + .store = dlm_attr_store, +}; + +static struct kobj_type dlm_ktype = { + .default_attrs = dlm_attrs, + .sysfs_ops = dlm_attr_ops, +}; + +static struct kset dlm_kset = { + .subsys = kernel_subsys, + .kobj = {.name = dlm,}, + .ktype = dlm_ktype, +}; + +static int kobject_setup(struct dlm_ls *ls) +{ + char lsname[DLM_LOCKSPACE_LEN]; + int error; + + memset(lsname, 0, DLM_LOCKSPACE_LEN); + snprintf(lsname, DLM_LOCKSPACE_LEN, %s, ls-ls_name); + + error = kobject_set_name(ls-ls_kobj, %s, lsname); + if (error) + return error; + + ls-ls_kobj.kset = dlm_kset; + ls-ls_kobj.ktype = dlm_ktype; + return 0; +} + +static int do_uevent(struct dlm_ls *ls, int in) +{ + int error; + + if (in) + kobject_uevent(ls-ls_kobj, KOBJ_ONLINE, NULL); + else + kobject_uevent(ls-ls_kobj, KOBJ_OFFLINE, NULL); + + error
[PATCH 3/3] dlm: use jhash
Use linux/jhash.h instead of our own hash function. Signed-off-by: David Teigland [EMAIL PROTECTED] --- dir.c |2 +- dlm_internal.h |1 + lock.c |2 +- util.c | 34 -- util.h |2 -- 5 files changed, 3 insertions(+), 38 deletions(-) diff -urpN a/drivers/dlm/dir.c b/drivers/dlm/dir.c --- a/drivers/dlm/dir.c 2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/dir.c 2005-08-18 13:47:29.112803024 +0800 @@ -120,7 +120,7 @@ static inline uint32_t dir_hash(struct d { uint32_t val; - val = dlm_hash(name, len); + val = jhash(name, len, 0); val = (ls-ls_dirtbl_size - 1); return val; diff -urpN a/drivers/dlm/dlm_internal.h b/drivers/dlm/dlm_internal.h --- a/drivers/dlm/dlm_internal.h2005-08-18 13:26:02.651374888 +0800 +++ b/drivers/dlm/dlm_internal.h2005-08-18 13:47:29.112803024 +0800 @@ -34,6 +34,7 @@ #include linux/kobject.h #include linux/kref.h #include linux/kernel.h +#include linux/jhash.h #include asm/semaphore.h #include asm/uaccess.h diff -urpN a/drivers/dlm/lock.c b/drivers/dlm/lock.c --- a/drivers/dlm/lock.c2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/lock.c2005-08-18 13:47:29.114802720 +0800 @@ -369,7 +369,7 @@ static int find_rsb(struct dlm_ls *ls, c if (dlm_no_directory(ls)) flags |= R_CREATE; - hash = dlm_hash(name, namelen); + hash = jhash(name, namelen, 0); bucket = hash (ls-ls_rsbtbl_size - 1); error = search_rsb(ls, name, namelen, bucket, flags, r); diff -urpN a/drivers/dlm/util.c b/drivers/dlm/util.c --- a/drivers/dlm/util.c2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/util.c2005-08-18 13:47:29.115802568 +0800 @@ -13,40 +13,6 @@ #include dlm_internal.h #include rcom.h -/** - * dlm_hash - hash an array of data - * @data: the data to be hashed - * @len: the length of data to be hashed - * - * Copied from GFS which copied from... - * - * Take some data and convert it to a 32-bit hash. 
- * This is the 32-bit FNV-1a hash from: - * http://www.isthe.com/chongo/tech/comp/fnv/ - */ - -static inline uint32_t hash_more_internal(const void *data, unsigned int len, - uint32_t hash) -{ - unsigned char *p = (unsigned char *)data; - unsigned char *e = p + len; - uint32_t h = hash; - - while (p e) { - h ^= (uint32_t)(*p++); - h *= 0x01000193; - } - - return h; -} - -uint32_t dlm_hash(const void *data, int len) -{ - uint32_t h = 0x811C9DC5; - h = hash_more_internal(data, len, h); - return h; -} - static void header_out(struct dlm_header *hd) { hd-h_version = cpu_to_le32(hd-h_version); diff -urpN a/drivers/dlm/util.h b/drivers/dlm/util.h --- a/drivers/dlm/util.h2005-08-17 17:19:22.0 +0800 +++ b/drivers/dlm/util.h2005-08-18 13:47:29.115802568 +0800 @@ -13,8 +13,6 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ -uint32_t dlm_hash(const char *data, int len); - void dlm_message_out(struct dlm_message *ms); void dlm_message_in(struct dlm_message *ms); void dlm_rcom_out(struct dlm_rcom *rc); - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] configfs: export config_group_find_obj
In the dlm I use config_group_find_obj() which isn't exported. Signed-off-by: David Teigland [EMAIL PROTECTED] diff -urpN a/fs/configfs/item.c b/fs/configfs/item.c --- a/fs/configfs/item.c2005-08-17 17:19:23.0 +0800 +++ b/fs/configfs/item.c2005-08-18 14:15:51.681973168 +0800 @@ -224,4 +224,5 @@ EXPORT_SYMBOL(config_item_init); EXPORT_SYMBOL(config_group_init); EXPORT_SYMBOL(config_item_get); EXPORT_SYMBOL(config_item_put); +EXPORT_SYMBOL(config_group_find_obj); - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/3] dlm: use configfs
Are you the official maintainer of the DLM subsystem? Could you submit a patch to add a MAINTAINERS entry? I was looking for a maintainer to yes Signed-off-by: David Teigland [EMAIL PROTECTED] diff -urpN a/MAINTAINERS b/MAINTAINERS --- a/MAINTAINERS 2005-08-17 17:19:23.0 +0800 +++ b/MAINTAINERS 2005-08-18 15:08:41.270122528 +0800 @@ -748,6 +748,13 @@ M: [EMAIL PROTECTED] L: linux-kernel@vger.kernel.org S: Maintained +DLM, DISTRIBUTED LOCK MANAGER +P: David Teigland +M: [EMAIL PROTECTED] +L: [EMAIL PROTECTED] +W: http://sources.redhat.com/cluster +S: Maintained + DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER P: Tobias Ringstrom M: [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/3] dlm: use configfs
On Thu, Aug 18, 2005 at 02:23:48PM -0700, Mark Fasheh wrote: On Thu, Aug 18, 2005 at 02:07:50PM +0800, David Teigland wrote: + * /config/dlm/cluster/spaces/space/nodes/node/nodeid + * /config/dlm/cluster/spaces/space/nodes/node/weight + * /config/dlm/cluster/comms/comm/nodeid + * /config/dlm/cluster/comms/comm/local + * /config/dlm/cluster/comms/comm/addr So what happened to factoring out the common parts of ocfs2_nodemanager? I was quite a big fan of that approach :) Or am I just misunderstanding what these patches do? The nodemanager RFC I sent a month ago http://marc.theaimsgroup.com/?l=linux-kernelm=112166723919347w=2 amounts to half of dlm/config.c (everything under comms/ above) moved into a separate kernel module. That would be trivial to do, and is still an option to bat around. I question whether factoring such a small chunk into a separate module is really worth it, though? Making all of config.c (all of /config/dlm/ above) into a separate module wouldn't seem quite so strange. It would require just a few lines of code to turn it into a stand alone module. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [-mm patch] DLM must depend on SYSFS
On 8/23/05, Adrian Bunk [EMAIL PROTECTED] wrote: config DLM tristate Distributed Lock Manager (DLM) + depends on SYSFS depends on IPV6 || IPV6=n select IP_SCTP select CONFIGFS_FS Thanks, you added this once before but it got clobbered by your subsequent depends on IPV6 patch. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 00/14] GFS
Hi, GFS (Global File System) is a cluster file system that we'd like to see added to the kernel. The 14 patches total about 900K so I won't send them to the list unless that's requested. Comments and suggestions are welcome. Thanks http://redhat.com/~teigland/gfs2/20050801/gfs2-full.patch http://redhat.com/~teigland/gfs2/20050801/broken-out/ Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Tue, Aug 02, 2005 at 09:45:24AM +0200, Arjan van de Ven wrote: * The on disk structures are defined in terms of uint32_t and friends, which are NOT endian neutral. Why are they not le32/be32 and thus endian-defined? Did you run bitwise-sparse on GFS yet ? GFS has had proper endian handling for many years, it's still correct as far as we've been able to test. I ran bitwise-sparse yesterday and didn't find anything alarming. * None of your on disk structures are packet. Are you sure? Quite, particular attention has been paid to aligning the structure fields, you'll find pad fields throughout. We'll write a quick test to verify that packing doesn't change anything. +#define gfs2_16_to_cpu be16_to_cpu +#define gfs2_32_to_cpu be32_to_cpu +#define gfs2_64_to_cpu be64_to_cpu why this pointless abstracting? #ifdef GFS2_ENDIAN_BIG #define gfs2_16_to_cpu be16_to_cpu #define gfs2_32_to_cpu be32_to_cpu #define gfs2_64_to_cpu be64_to_cpu #define cpu_to_gfs2_16 cpu_to_be16 #define cpu_to_gfs2_32 cpu_to_be32 #define cpu_to_gfs2_64 cpu_to_be64 #else /* GFS2_ENDIAN_BIG */ #define gfs2_16_to_cpu le16_to_cpu #define gfs2_32_to_cpu le32_to_cpu #define gfs2_64_to_cpu le64_to_cpu #define cpu_to_gfs2_16 cpu_to_le16 #define cpu_to_gfs2_32 cpu_to_le32 #define cpu_to_gfs2_64 cpu_to_le64 #endif /* GFS2_ENDIAN_BIG */ The point is you can define GFS2_ENDIAN_BIG to compile gfs to be BE on-disk instead of LE which is another useful way to verify endian correctness. You should be able to use gfs in mixed architecture and mixed endian clusters. We don't have a mixed endian cluster to test, though. * +static const uint32_t crc_32_tab[] = . why do you duplicate this? The kernel has a perfectly good set of generic crc32 tables/functions just fine We'll try them, they'll probably do fine. * Why use your own journalling layer and not say ... jbd ? 
Here's an analysis of three approaches to cluster-fs journaling and their pros/cons (including using jbd): http://tinyurl.com/7sbqq * + while (!kthread_should_stop()) { + gfs2_scand_internal(sdp); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(gfs2_tune_get(sdp, gt_scand_secs) * HZ); + } you probably really want to check for signals if you do interruptible sleeps I don't know why we'd be interested in signals here. * why not use msleep() and friends instead of schedule_timeout(), you're not using the complex variants anyway When unmounting we really appreciate waking up more often than the timeout, otherwise the unmount sits and waits for the longest daemon's msleep to complete. I converted this to msleep recently but it was too painful and had to go back. We'll get to your other comments, thanks. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Tue, Aug 02, 2005 at 01:16:53PM +0300, Pekka Enberg wrote: +void *gmalloc_nofail_real(unsigned int size, int flags, char *file, + unsigned int line) +{ + void *x; + for (;;) { + x = kmalloc(size, flags); + if (x) + return x; + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { + printk(GFS2: out of memory: %s, %u\n, + __FILE__, __LINE__); + gfs2_malloc_warning = jiffies; + } + yield(); This does not belong in a filesystem. It also seems like a very bad idea. What are you trying to do here? If you absolutely must not fail, use __GFP_NOFAIL instead. will do, carried over from before NOFAIL existed -mm has memory leak detection patches and there are others floating around. Please do not introduce yet another subsystem-specific debug allocator. ok, thanks Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Wed, Aug 03, 2005 at 11:17:09AM +0200, Arjan van de Ven wrote: On Wed, 2005-08-03 at 11:56 +0800, David Teigland wrote: The point is you can define GFS2_ENDIAN_BIG to compile gfs to be BE on-disk instead of LE which is another useful way to verify endian correctness. that sounds wrong to be a compile option. If you really want to deal with dual disk endianness it really ought to be a runtime one (see jffs2 for example). We don't want BE to be an option per se; as developers we'd just like to be able to compile it that way to verify gfs's endianness handling. If you think that's unmaintainable or a bad idea we'll rip it out. * + while (!kthread_should_stop()) { + gfs2_scand_internal(sdp); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(gfs2_tune_get(sdp, gt_scand_secs) * HZ); you probably really want to check for signals if you do interruptible sleeps I don't know why we'd be interested in signals here. well.. because if you don't your schedule_timeout becomes a nop when you get one, which makes your loop a busy waiting one. OK, it looks like we need to block/flush signals a la daemonize(); I guess I mistakenly figured the kthread routines did everything daemonize did. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Tue, Aug 02, 2005 at 09:45:24AM +0200, Arjan van de Ven wrote: * +static const uint32_t crc_32_tab[] = . why do you duplicate this? The kernel has a perfectly good set of generic crc32 tables/functions just fine The gfs2_disk_hash() function and the crc table on which it's based are a part of gfs2_ondisk.h: the ondisk metadata specification. This is a bit unusual since gfs uses a hash table on-disk for its directory structure. This header, including the hash function/table, must be included by user space programs like fsck that want to decipher a fs, and any change to the function or table would effectively make the fs corrupted. Because of this I think it's best for gfs to keep it's own copy as part of its ondisk format spec. * Why are you using bufferheads extensively in a new filesystem? bh's are used for metadata, the log, and journaled data which need to be written at the block granularity, not page. why do you use a rwsem and not a regular semaphore? You are aware that rwsems are far more expensive than regular ones right? How skewed is the read/write ratio? Aware, yes, it's the only rwsem in gfs. Specific skew, no, we'll have to measure that. * +++ b/fs/gfs2/fixed_div64.h 2005-08-01 14:13:08.009808200 +0800 e why? I'm not sure, actually, apart from the comments: do_div: /* For ia32 we need to pull some tricks to get past various versions of the compiler which do not like us using do_div in the middle of large functions. */ do_mod: /* Side effect free 64 bit mod operation */ fs/xfs/linux-2.6/xfs_linux.h (the origin of this file) has the same thing, perhaps this is an old problem that's now fixed? * int gfs2_copy2user(struct buffer_head *bh, char **buf, unsigned int offset, +unsigned int size) +{ + int error; + + if (bh) + error = copy_to_user(*buf, bh-b_data + offset, size); + else + error = clear_user(*buf, size); that looks to be missing a few kmaps.. whats the guarantee that b_data is actually, like in lowmem? 
This is only used in the specific case of reading a journaled-data file. That seems to effectively be the same as reading a buffer of fs metadata. The diaper device is a block device within gfs that gets transparently inserted between the real device and the rest of the filesystem. h why not use device mapper or something? Is this really needed? This is needed for the withdraw feature (described in the comment) which is fairly important. We'll see if dm could be used instead. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Fri, Aug 05, 2005 at 09:34:38AM +0200, Arjan van de Ven wrote: On Fri, 2005-08-05 at 15:14 +0800, David Teigland wrote: On Tue, Aug 02, 2005 at 09:45:24AM +0200, Arjan van de Ven wrote: * +static const uint32_t crc_32_tab[] = . why do you duplicate this? The kernel has a perfectly good set of generic crc32 tables/functions just fine The gfs2_disk_hash() function and the crc table on which it's based are a part of gfs2_ondisk.h: the ondisk metadata specification. This is a bit unusual since gfs uses a hash table on-disk for its directory structure. This header, including the hash function/table, must be included by user space programs like fsck that want to decipher a fs, and any change to the function or table would effectively make the fs corrupted. Because of this I think it's best for gfs to keep its own copy as part of its ondisk format spec. for userspace there's libcrc32 as well. If it's *the* bog standard crc32 I don't see a reason why your spec can't just reference that instead. And esp in the kernel you should just use the in kernel one not your own regardless; you can assume the in kernel one is optimized and it also keeps size down. linux/lib/crc32table.h : crc32table_le[] is the same as our crc_32_tab[]. This looks like a standard that's not going to change, as you've said, so including crc32table.h and getting rid of our own table would work fine. Do we go a step beyond this and use say the crc32() function from linux/crc32.h? Is this _function_ as standard and unchanging as the table of crcs? In my tests it doesn't produce the same results as our gfs2_disk_hash() function, even with both using the same crc table. I don't mind adopting a new function and just writing a user space equivalent for the tools if it's a fixed standard. 
Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Fri, Aug 05, 2005 at 12:07:50PM +0200, Jörn Engel wrote: On Fri, 5 August 2005 17:44:52 +0800, David Teigland wrote: Do we go a step beyond this and use say the crc32() function from linux/crc32.h? Is this _function_ as standard and unchanging as the table of crcs? In my tests it doesn't produce the same results as our gfs2_disk_hash() function, even with both using the same crc table. I don't mind adopting a new function and just writing a user space equivalent for the tools if it's a fixed standard. The function is basically set in stone. Variants exist depending on how it is called. I know of four variants, but there may be more: 1. Initial value is 0 2. Initial value is 0xffffffff a) Result is taken as-is b) Result is XORed with 0xffffffff Maybe your code implements 1a, while you tried 2b with the lib/crc32.c function or something similar? You're right, initial value 0xffffffff and xor result with 0xffffffff matches the results from our function. Great, we can get rid of gfs2_disk_hash() and use crc32() directly. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Fri, Aug 05, 2005 at 03:14:15PM +0800, David Teigland wrote: On Tue, Aug 02, 2005 at 09:45:24AM +0200, Arjan van de Ven wrote: * +++ b/fs/gfs2/fixed_div64.h 2005-08-01 14:13:08.009808200 +0800 e why? I'm not sure, actually, apart from the comments: do_div: /* For ia32 we need to pull some tricks to get past various versions of the compiler which do not like us using do_div in the middle of large functions. */ do_mod: /* Side effect free 64 bit mod operation */ fs/xfs/linux-2.6/xfs_linux.h (the origin of this file) has the same thing, perhaps this is an old problem that's now fixed? I've looked into getting rid of these: - The existing do_div() works fine for me with 64 bit numerators, so I'll get rid of the fixed version. - The fixed do_mod() seems to be the only way to do 64 bit modulus. It would be great if I was wrong about that... Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/14] GFS
On Wed, Aug 03, 2005 at 09:44:06AM +0300, Pekka Enberg wrote: +uint32_t gfs2_hash(const void *data, unsigned int len) +{ + uint32_t h = 0x811C9DC5; + h = hash_more_internal(data, len, h); + return h; +} Is there a reason why you cannot use linux/hash.h or linux/jhash.h? See gfs2_hash_more() and comment; we hash discontiguous regions. +#define RETRY_MALLOC(do_this, until_this) \ +for (;;) { \ + { do_this; } \ + if (until_this) \ + break; \ + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { \ + printk(GFS2: out of memory: %s, %u\n, __FILE__, __LINE__); \ + gfs2_malloc_warning = jiffies; \ + } \ + yield(); \ +} Please drop this. Done in the spot that could deal with an error, but there are three other places that still need it. +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + struct gfs2_ea_location *el) +{ + { + struct ea_set es; + int error; + + memset(es, 0, sizeof(struct ea_set)); + es.es_er = er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, es); + if (error 0) + return 0; + if (error) + return error; + } + { + unsigned int blks = 2; + if (!(ip-i_di.di_flags GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(er) ip-i_sbd-sd_jbsize) + blks += DIV_RU(er-er_data_len, +ip-i_sbd-sd_jbsize); + + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); + } Please drop the extra braces. Here and elsewhere we try to keep unused stuff off the stack. Are you suggesting that we're being overly cautious, or do you just dislike the way it looks? Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS
On Mon, Aug 08, 2005 at 01:18:45PM +0300, Pekka J Enberg wrote: gfs2-02.patch:+ RETRY_MALLOC(ip = kmem_cache_alloc(gfs2_inode_cachep, - GFP_NOFAIL. Already gone, inode_create() can return an error. if (create) { RETRY_MALLOC(page = grab_cache_page(aspace-i_mapping, index), page); } else { page = find_lock_page(aspace-i_mapping, index); if (!page) return NULL; } I think you can set aspace-flags to GFP_NOFAIL will try that but why can't you return NULL here on failure like you do for find_lock_page()? because create is set gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_KERNEL), - GFP_NOFAIL It looks to me like NOFAIL does nothing for kmem_cache_alloc(). Am I seeing that wrong? gfs2-10.patch:+ RETRY_MALLOC(new_gh = gfs2_holder_get(gl, state, gfs2_holder_get uses kmalloc which can use GFP_NOFAIL. Which means adding a new gfp_flags parameter... fine. Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS
On Mon, Aug 08, 2005 at 01:57:55PM +0300, Pekka J Enberg wrote: David Teigland writes: but why can't you return NULL here on failure like you do for find_lock_page()? because create is set Yes, but looking at (some of the) top-level callers, there's no real reason why create must not fail. Am I missing something here? I'll trace the callers back farther and see about dealing with errors. gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, It is passed to the page allocator just like with kmalloc() which uses __cache_alloc() too. Yes, I read it wrongly, looks like NOFAIL should work fine. I think we can get rid of the RETRY macro entirely. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: GFS, what's remaining
On Thu, Sep 01, 2005 at 01:35:23PM +0200, Arjan van de Ven wrote: +static inline void glock_put(struct gfs2_glock *gl) +{ + if (atomic_read(gl-gl_count) == 1) + gfs2_glock_schedule_for_reclaim(gl); + gfs2_assert(gl-gl_sbd, atomic_read(gl-gl_count) 0,); + atomic_dec(gl-gl_count); +} this code has a race The first two lines of the function with the race are non-essential and could be removed. In the common case where there's no race, they just add efficiency by moving the glock to the reclaim list immediately. Otherwise, the scand thread would do it later when actively trying to reclaim glocks. +static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head) +{ + int empty; + spin_lock(gl-gl_spin); + empty = list_empty(head); + spin_unlock(gl-gl_spin); + return empty; +} that looks like a racey interface to me... if so.. why bother locking at all? The spinlock protects the list but is not the primary method of synchronizing processes that are working with a glock. When the list is in fact empty, there will be no race, and the locking wouldn't be necessary. In this case, the glmutex in the code fragment below is preventing any change in the list, so we can safely release the spinlock immediately. When the list is not empty, then a process could be adding another entry to the list without glmutex locked [1], making the spinlock necessary. In this case we quit after queue_empty() returns and don't do anything else, so releasing the spinlock immediately was still safe. [1] A process that already holds a glock (i.e. has a holder struct on the gl_holders list) is allowed to hold it again by adding another holder struct to the same list. It adds the second hold without locking glmutex. 
if (gfs2_glmutex_trylock(gl)) { if (gl-gl_ops == gfs2_inode_glops) { struct gfs2_inode *ip = get_gl2ip(gl); if (ip !atomic_read(ip-i_count)) gfs2_inode_destroy(ip); } if (queue_empty(gl, gl-gl_holders) gl-gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED); gfs2_glmutex_unlock(gl); } There is a second way that queue_empty() is used, and that's within assertions that the list is empty. If the assertion is correct, locking isn't necessary; locking is only needed if there's already another bug causing the list to not be empty and the assertion to fail. static int gi_skeleton(struct gfs2_inode *ip, struct gfs2_ioctl *gi, +gi_filler_t filler) +{ + unsigned int size = gfs2_tune_get(ip-i_sbd, gt_lockdump_size); + char *buf; + unsigned int count = 0; + int error; + + if (size gi-gi_size) + size = gi-gi_size; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = filler(ip, gi, buf, size, count); + if (error) + goto out; + + if (copy_to_user(gi-gi_data, buf, count + 1)) + error = -EFAULT; where does count get a sensible value? from filler() We'll add comments in the code to document the things above. Thanks, Dave - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] dlm: check the maximum size of a request from user
On Sun, Sep 09, 2012 at 04:16:58PM +0200, Sasha Levin wrote: device_write only checks whether the request size is big enough, but it doesn't check if the size is too big. At that point, it also tries to allocate as much memory as the user has requested even if it's too much. This can lead to OOM killer kicking in, or memory corruption if (count + 1) overflows. thanks, pushed to next -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] dlm updates for 3.6
Hi Linus, Please pull dlm updates from tag: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git dlm-3.6 This set includes a major redesign of recording the master node for resources. The old dir hash table, which just held the master node for each resource, has been removed. The rsb hash table has always duplicated the master node value from the dir, and is now the single record of it. Having two full hash tables of all resources has always been a waste, especially since one just duplicated a single value from the other. Local requests will now often require one instead of two lengthy hash table searches. The other substantial change is made possible by the dirtbl removal, and fixes a long standing race between resource removal and lookup by reworking how removal is done. At the same time it improves the efficiency of removal by avoiding repeated searches through a hash bucket. The other commits include minor fixes and changes. Thanks, Dave David Teigland (6): dlm: use rsbtbl as resource directory dlm: use idr instead of list for recovered rsbs dlm: fix race between remove and lookup dlm: use wait_event_timeout dlm: fix conversion deadlock from recovery dlm: fix missing dir remove fs/dlm/config.c |7 - fs/dlm/config.h |1 - fs/dlm/debug_fs.c | 103 +++- fs/dlm/dir.c | 287 +++ fs/dlm/dir.h |7 +- fs/dlm/dlm_internal.h | 62 ++- fs/dlm/lock.c | 1292 - fs/dlm/lock.h |5 +- fs/dlm/lockspace.c| 45 +- fs/dlm/rcom.c | 147 -- fs/dlm/rcom.h |1 + fs/dlm/recover.c | 295 +++ fs/dlm/recover.h |2 +- fs/dlm/recoverd.c | 14 +- 14 files changed, 1600 insertions(+), 668 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] dlm: cleanup send_to_sock routine
On Mon, Aug 13, 2012 at 02:29:55PM +0800, Ying Xue wrote: But I have submitted another patch: https://lkml.org/lkml/2012/8/9/668 Yes I got it, to see all the patches I've pushed out, check here: http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/teigland/linux-dlm.git;a=shortlog;h=refs/heads/next Remove unnecessary code from send_to_sock routine. ok -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] dlm: convert add_sock routine return value type to void
On Fri, Aug 10, 2012 at 02:58:42PM +0800, Ying Xue wrote: Since add_sock() always returns a success code - 0, its return value type should be changed from integer to void. Thanks, I've pushed those to my next branch. Dave -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 00/16] dlm patches for 2.6.25 (batch 2)
This is a second batch of dlm patches that I think should be ready for 2.6.25. It's largely clean ups and minor fixes from Al Viro, and a couple more added by me after Al pointed them out. Thanks, Dave Al Viro (13): dlm: use proper C for dlm/requestqueue stuff (and fix alignment bug) dlm: dlm_process_incoming_buffer() fixes dlm: do not byteswap rcom_lock dlm: do not byteswap rcom_config dlm: use proper type for -ls_recover_buf dlm: missing length check in check_config() dlm: validate data in dlm_recover_directory() dlm: verify that places expecting rcom_lock have packet long enough dlm: receive_rcom_lock_args() overflow check dlm: make find_rsb() fail gracefully when namelen is too large dlm: fix overflows when copying from -m_extra to lvb dlm: fix dlm_dir_lookup() handling of too long names dlm: dlm/user.c input validation fixes David Teigland (2): dlm: proper types for asts and basts dlm: eliminate astparam type casting Denis Cheng (1): dlm: add __init and __exit marks to init and exit functions fs/dlm/ast.c |9 ++-- fs/dlm/config.c |2 +- fs/dlm/debug_fs.c |8 +-- fs/dlm/dir.c | 28 +-- fs/dlm/dlm_internal.h | 53 +++ fs/dlm/lock.c | 139 + fs/dlm/lock.h |2 +- fs/dlm/lockspace.c|2 +- fs/dlm/memory.c |4 +- fs/dlm/midcomms.c | 33 +++- fs/dlm/netlink.c |9 ++-- fs/dlm/rcom.c | 63 ++ fs/dlm/recover.c |4 +- fs/dlm/requestqueue.c | 12 ++-- fs/dlm/requestqueue.h |2 +- fs/dlm/user.c | 29 -- fs/dlm/util.c | 61 - 17 files changed, 235 insertions(+), 225 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 04/16] dlm: do not byteswap rcom_config
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dlm_internal.h |6 +++--- fs/dlm/rcom.c | 15 --- fs/dlm/util.c | 20 3 files changed, 11 insertions(+), 30 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e73b988..187a5b5 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -410,9 +410,9 @@ union dlm_packet { }; struct rcom_config { - uint32_trf_lvblen; - uint32_trf_lsflags; - uint64_trf_unused; + __le32 rf_lvblen; + __le32 rf_lsflags; + __le64 rf_unused; }; struct rcom_lock { diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 86c1ab9..fb07762 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -78,8 +78,8 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, static void make_config(struct dlm_ls *ls, struct rcom_config *rf) { - rf-rf_lvblen = ls-ls_lvblen; - rf-rf_lsflags = ls-ls_exflags; + rf-rf_lvblen = cpu_to_le32(ls-ls_lvblen); + rf-rf_lsflags = cpu_to_le32(ls-ls_exflags); } static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) @@ -93,11 +93,12 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return -EPROTO; } - if (rf-rf_lvblen != ls-ls_lvblen || - rf-rf_lsflags != ls-ls_exflags) { + if (le32_to_cpu(rf-rf_lvblen) != ls-ls_lvblen || + le32_to_cpu(rf-rf_lsflags) != ls-ls_exflags) { log_error(ls, config mismatch: %d,%x nodeid %d: %d,%x, - ls-ls_lvblen, ls-ls_exflags, - nodeid, rf-rf_lvblen, rf-rf_lsflags); + ls-ls_lvblen, ls-ls_exflags, nodeid, + le32_to_cpu(rf-rf_lvblen), + le32_to_cpu(rf-rf_lsflags)); return -EPROTO; } return 0; @@ -401,7 +402,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc-rc_result = -ESRCH; rf = (struct rcom_config *) rc-rc_buf; - rf-rf_lvblen = -1; + rf-rf_lvblen = cpu_to_le32(~0U); dlm_rcom_out(rc); dlm_lowcomms_commit_buffer(mh); diff --git a/fs/dlm/util.c b/fs/dlm/util.c index d3ed6da..e36520a 100644 --- a/fs/dlm/util.c +++ 
b/fs/dlm/util.c @@ -131,22 +131,8 @@ void dlm_message_in(struct dlm_message *ms) ms-m_result= from_dlm_errno(le32_to_cpu(ms-m_result)); } -static void rcom_config_out(struct rcom_config *rf) -{ - rf-rf_lvblen = cpu_to_le32(rf-rf_lvblen); - rf-rf_lsflags = cpu_to_le32(rf-rf_lsflags); -} - -static void rcom_config_in(struct rcom_config *rf) -{ - rf-rf_lvblen = le32_to_cpu(rf-rf_lvblen); - rf-rf_lsflags = le32_to_cpu(rf-rf_lsflags); -} - void dlm_rcom_out(struct dlm_rcom *rc) { - int type = rc-rc_type; - header_out(rc-rc_header); rc-rc_type = cpu_to_le32(rc-rc_type); @@ -154,9 +140,6 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc-rc_id = cpu_to_le64(rc-rc_id); rc-rc_seq = cpu_to_le64(rc-rc_seq); rc-rc_seq_reply= cpu_to_le64(rc-rc_seq_reply); - - if (type == DLM_RCOM_STATUS_REPLY) - rcom_config_out((struct rcom_config *) rc-rc_buf); } void dlm_rcom_in(struct dlm_rcom *rc) @@ -168,7 +151,4 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc-rc_id = le64_to_cpu(rc-rc_id); rc-rc_seq = le64_to_cpu(rc-rc_seq); rc-rc_seq_reply= le64_to_cpu(rc-rc_seq_reply); - - if (rc-rc_type == DLM_RCOM_STATUS_REPLY) - rcom_config_in((struct rcom_config *) rc-rc_buf); } -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 07/16] dlm: validate data in dlm_recover_directory()
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dir.c | 23 --- 1 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index ce30136..831050e 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -220,6 +220,7 @@ int dlm_recover_directory(struct dlm_ls *ls) last_len = 0; for (;;) { + int left; error = dlm_recovery_stopped(ls); if (error) goto out_free; @@ -236,11 +237,20 @@ int dlm_recover_directory(struct dlm_ls *ls) */ b = ls-ls_recover_buf-rc_buf; + left = ls-ls_recover_buf-rc_header.h_length; + left -= sizeof(struct dlm_rcom); for (;;) { - memcpy(namelen, b, sizeof(uint16_t)); - namelen = be16_to_cpu(namelen); - b += sizeof(uint16_t); + __be16 v; + + error = -EINVAL; + if (left sizeof(__be16)) + goto out_free; + + memcpy(v, b, sizeof(__be16)); + namelen = be16_to_cpu(v); + b += sizeof(__be16); + left -= sizeof(__be16); /* namelen of 0xF marks end of names for this node; namelen of 0 marks end of the @@ -251,6 +261,12 @@ int dlm_recover_directory(struct dlm_ls *ls) if (!namelen) break; + if (namelen left) + goto out_free; + + if (namelen DLM_RESNAME_MAXLEN) + goto out_free; + error = -ENOMEM; de = get_free_de(ls, namelen); if (!de) @@ -262,6 +278,7 @@ int dlm_recover_directory(struct dlm_ls *ls) memcpy(de-name, b, namelen); memcpy(last_name, b, namelen); b += namelen; + left -= namelen; add_entry_to_hash(ls, de); count++; -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/16] dlm: dlm_process_incoming_buffer() fixes
From: Al Viro [EMAIL PROTECTED] * check that length is large enough to cover the non-variable part of message or rcom resp. (after checking that it's large enough to cover the header, of course). * kill more pointless casts Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dlm_internal.h |6 ++ fs/dlm/lock.c | 19 +-- fs/dlm/lock.h |2 +- fs/dlm/midcomms.c | 33 - 4 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index ec61bba..65499ce 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -403,6 +403,12 @@ struct dlm_rcom { charrc_buf[0]; }; +union dlm_packet { + struct dlm_header header; /* common to other two */ + struct dlm_message message; + struct dlm_rcom rcom; +}; + struct rcom_config { uint32_trf_lvblen; uint32_trf_lsflags; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d9f07a4..2a28048 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3822,21 +3822,20 @@ void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms) standard locking activity) or an RCOM (recovery message sent as part of lockspace recovery). 
*/ -void dlm_receive_buffer(struct dlm_header *hd, int nodeid) +void dlm_receive_buffer(union dlm_packet *p, int nodeid) { - struct dlm_message *ms = (struct dlm_message *) hd; - struct dlm_rcom *rc = (struct dlm_rcom *) hd; + struct dlm_header *hd = p-header; struct dlm_ls *ls; int type = 0; switch (hd-h_cmd) { case DLM_MSG: - dlm_message_in(ms); - type = ms-m_type; + dlm_message_in(p-message); + type = p-message.m_type; break; case DLM_RCOM: - dlm_rcom_in(rc); - type = rc-rc_type; + dlm_rcom_in(p-rcom); + type = p-rcom.rc_type; break; default: log_print(invalid h_cmd %d from %u, hd-h_cmd, nodeid); @@ -3856,7 +3855,7 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) hd-h_lockspace, nodeid, hd-h_cmd, type); if (hd-h_cmd == DLM_RCOM type == DLM_RCOM_STATUS) - dlm_send_ls_not_ready(nodeid, rc); + dlm_send_ls_not_ready(nodeid, p-rcom); return; } @@ -3865,9 +3864,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) down_read(ls-ls_recv_active); if (hd-h_cmd == DLM_MSG) - dlm_receive_message(ls, ms, nodeid); + dlm_receive_message(ls, p-message, nodeid); else - dlm_receive_rcom(ls, rc, nodeid); + dlm_receive_rcom(ls, p-rcom, nodeid); up_read(ls-ls_recv_active); dlm_put_lockspace(ls); diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 27b6ed3..05d9c82 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -17,7 +17,7 @@ void dlm_print_rsb(struct dlm_rsb *r); void dlm_dump_rsb(struct dlm_rsb *r); void dlm_print_lkb(struct dlm_lkb *lkb); void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); -void dlm_receive_buffer(struct dlm_header *hd, int nodeid); +void dlm_receive_buffer(union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e69926e..07ac709 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -61,9 +61,9 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, union { 
unsigned char __buf[DLM_INBUF_LEN]; /* this is to force proper alignment on some arches */ - struct dlm_header dlm; + union dlm_packet p; } __tmp; - struct dlm_header *msg = __tmp.dlm; + union dlm_packet *p = __tmp.p; int ret = 0; int err = 0; uint16_t msglen; @@ -75,15 +75,22 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, message may wrap around the end of the buffer back to the start, so we need to use a temp buffer and copy_from_cb. */ - copy_from_cb(msg, base, offset, sizeof(struct dlm_header), + copy_from_cb(p, base, offset, sizeof(struct dlm_header), limit); - msglen = le16_to_cpu(msg-h_length); - lockspace = msg-h_lockspace; + msglen = le16_to_cpu(p-header.h_length); + lockspace = p-header.h_lockspace; err = -EINVAL; if (msglen sizeof(struct dlm_header)) break
[PATCH 03/16] dlm: do not byteswap rcom_lock
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dlm_internal.h | 22 +++--- fs/dlm/lock.c | 34 +++--- fs/dlm/rcom.c | 14 +++--- fs/dlm/util.c | 45 ++--- 4 files changed, 39 insertions(+), 76 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 65499ce..e73b988 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -416,21 +416,21 @@ struct rcom_config { }; struct rcom_lock { - uint32_trl_ownpid; - uint32_trl_lkid; - uint32_trl_remid; - uint32_trl_parent_lkid; - uint32_trl_parent_remid; - uint32_trl_exflags; - uint32_trl_flags; - uint32_trl_lvbseq; - int rl_result; + __le32 rl_ownpid; + __le32 rl_lkid; + __le32 rl_remid; + __le32 rl_parent_lkid; + __le32 rl_parent_remid; + __le32 rl_exflags; + __le32 rl_flags; + __le32 rl_lvbseq; + __le32 rl_result; int8_t rl_rqmode; int8_t rl_grmode; int8_t rl_status; int8_t rl_asts; - uint16_trl_wait_type; - uint16_trl_namelen; + __le16 rl_wait_type; + __le16 rl_namelen; charrl_name[DLM_RESNAME_MAXLEN]; charrl_lvb[0]; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2a28048..75176b5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4273,12 +4273,12 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, int lvblen; lkb-lkb_nodeid = rc-rc_header.h_nodeid; - lkb-lkb_ownpid = rl-rl_ownpid; - lkb-lkb_remid = rl-rl_lkid; - lkb-lkb_exflags = rl-rl_exflags; - lkb-lkb_flags = rl-rl_flags 0x; + lkb-lkb_ownpid = le32_to_cpu(rl-rl_ownpid); + lkb-lkb_remid = le32_to_cpu(rl-rl_lkid); + lkb-lkb_exflags = le32_to_cpu(rl-rl_exflags); + lkb-lkb_flags = le32_to_cpu(rl-rl_flags) 0x; lkb-lkb_flags |= DLM_IFL_MSTCPY; - lkb-lkb_lvbseq = rl-rl_lvbseq; + lkb-lkb_lvbseq = le32_to_cpu(rl-rl_lvbseq); lkb-lkb_rqmode = rl-rl_rqmode; lkb-lkb_grmode = rl-rl_grmode; /* don't set lkb_status because add_lkb wants to itself */ @@ -4299,7 +4299,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 
The real granted mode of these converting locks cannot be determined until all locks have been rebuilt on the rsb (recover_conversion) */ - if (rl-rl_wait_type == DLM_MSG_CONVERT middle_conversion(lkb)) { + if (rl-rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) + middle_conversion(lkb)) { rl-rl_status = DLM_LKSTS_CONVERT; lkb-lkb_grmode = DLM_LOCK_IV; rsb_set_flag(r, RSB_RECOVER_CONVERT); @@ -4326,13 +4327,14 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) goto out; } - error = find_rsb(ls, rl-rl_name, rl-rl_namelen, R_MASTER, r); + error = find_rsb(ls, rl-rl_name, le16_to_cpu(rl-rl_namelen), +R_MASTER, r); if (error) goto out; lock_rsb(r); - lkb = search_remid(r, rc-rc_header.h_nodeid, rl-rl_lkid); + lkb = search_remid(r, rc-rc_header.h_nodeid, le32_to_cpu(rl-rl_lkid)); if (lkb) { error = -EEXIST; goto out_remid; @@ -4355,15 +4357,16 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ - rl-rl_remid = lkb-lkb_id; + rl-rl_remid = cpu_to_le32(lkb-lkb_id); out_unlock: unlock_rsb(r); put_rsb(r); out: if (error) - log_debug(ls, recover_master_copy %d %x, error, rl-rl_lkid); - rl-rl_result = error; + log_debug(ls, recover_master_copy %d %x, error, + le32_to_cpu(rl-rl_lkid)); + rl-rl_result = cpu_to_le32(error); return error; } @@ -4374,15 +4377,16 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct dlm_lkb *lkb; int error; - error = find_lkb(ls, rl-rl_lkid, lkb); + error = find_lkb(ls, le32_to_cpu(rl-rl_lkid), lkb); if (error) { - log_error(ls
[PATCH 08/16] dlm: verify that places expecting rcom_lock have packet long enough
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/lock.c |3 +++ fs/dlm/rcom.c | 12 +++- 2 files changed, 14 insertions(+), 1 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 75176b5..6c605fc 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4266,6 +4266,7 @@ static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, return NULL; } +/* needs at least dlm_rcom + rcom_lock */ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *r, struct dlm_rcom *rc) { @@ -4315,6 +4316,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, the given values and send back our lkid. We send back our lkid by sending back the rcom_lock struct we got but with the remid field filled in. */ +/* needs at least dlm_rcom + rcom_lock */ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc-rc_buf; @@ -4370,6 +4372,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return error; } +/* needs at least dlm_rcom + rcom_lock */ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc-rc_buf; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index a312f1d..ef9d0f9 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -357,6 +357,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) return error; } +/* needs at least dlm_rcom + rcom_lock */ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; @@ -448,6 +449,8 @@ static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + if (dlm_recovery_stopped(ls) (rc-rc_type != DLM_RCOM_STATUS)) { log_debug(ls, ignoring recovery message %x from %d, rc-rc_type, 
nodeid); @@ -471,6 +474,8 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_LOCK: + if (rc-rc_header.h_length lock_size) + goto Eshort; receive_rcom_lock(ls, rc); break; @@ -487,13 +492,18 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_LOCK_REPLY: + if (rc-rc_header.h_length lock_size) + goto Eshort; dlm_recover_process_copy(ls, rc); break; default: log_error(ls, receive_rcom bad type %d, rc-rc_type); } - out: +out: return; +Eshort: + log_error(ls, recovery message %x from %d is too short, + rc-rc_type, nodeid); } -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 06/16] dlm: missing length check in check_config()
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/rcom.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 3f9b96f..a312f1d 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -85,6 +85,7 @@ static void make_config(struct dlm_ls *ls, struct rcom_config *rf) static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { struct rcom_config *rf = (struct rcom_config *) rc-rc_buf; + size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); if ((rc-rc_header.h_version 0x) != DLM_HEADER_MAJOR) { log_error(ls, version mismatch: %x nodeid %d: %x, @@ -93,6 +94,12 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return -EPROTO; } + if (rc-rc_header.h_length conf_size) { + log_error(ls, config too short: %d nodeid %d, + rc-rc_header.h_length, nodeid); + return -EPROTO; + } + if (le32_to_cpu(rf-rf_lvblen) != ls-ls_lvblen || le32_to_cpu(rf-rf_lsflags) != ls-ls_exflags) { log_error(ls, config mismatch: %d,%x nodeid %d: %d,%x, -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 13/16] dlm: dlm/user.c input validation fixes
From: Al Viro [EMAIL PROTECTED] a) in device_write(): add sentinel NUL byte, making sure that lspace.name will be NUL-terminated b) in compat_input() be keep it simple about the amounts of data we are copying. Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/user.c | 19 --- 1 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7cbc682..c306045 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -82,7 +82,7 @@ struct dlm_lock_result32 { static void compat_input(struct dlm_write_request *kb, struct dlm_write_request32 *kb32, -int max_namelen) +size_t count) { kb-version[0] = kb32-version[0]; kb-version[1] = kb32-version[1]; @@ -94,7 +94,8 @@ static void compat_input(struct dlm_write_request *kb, kb-cmd == DLM_USER_REMOVE_LOCKSPACE) { kb-i.lspace.flags = kb32-i.lspace.flags; kb-i.lspace.minor = kb32-i.lspace.minor; - strcpy(kb-i.lspace.name, kb32-i.lspace.name); + memcpy(kb-i.lspace.name, kb32-i.lspace.name, count - + offsetof(struct dlm_write_request32, i.lspace.name)); } else if (kb-cmd == DLM_USER_PURGE) { kb-i.purge.nodeid = kb32-i.purge.nodeid; kb-i.purge.pid = kb32-i.purge.pid; @@ -112,11 +113,8 @@ static void compat_input(struct dlm_write_request *kb, kb-i.lock.bastaddr = (void *)(long)kb32-i.lock.bastaddr; kb-i.lock.lksb = (void *)(long)kb32-i.lock.lksb; memcpy(kb-i.lock.lvb, kb32-i.lock.lvb, DLM_USER_LVB_LEN); - if (kb-i.lock.namelen = max_namelen) - memcpy(kb-i.lock.name, kb32-i.lock.name, - kb-i.lock.namelen); - else - kb-i.lock.namelen = max_namelen; + memcpy(kb-i.lock.name, kb32-i.lock.name, count - + offsetof(struct dlm_write_request32, i.lock.name)); } } @@ -508,7 +506,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kzalloc(count + 1, GFP_KERNEL); if (!kbuf) return -ENOMEM; @@ -526,15 +524,14 @@ static ssize_t device_write(struct file *file, const char __user *buf, 
if (!kbuf-is64bit) { struct dlm_write_request32 *k32buf; k32buf = (struct dlm_write_request32 *)kbuf; - kbuf = kmalloc(count + (sizeof(struct dlm_write_request) - + kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) - sizeof(struct dlm_write_request32)), GFP_KERNEL); if (!kbuf) return -ENOMEM; if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, proc-flags); - compat_input(kbuf, k32buf, -count - sizeof(struct dlm_write_request32)); + compat_input(kbuf, k32buf, count + 1); kfree(k32buf); } #endif -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/16] dlm: use proper type for ->ls_recover_buf
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dir.c |2 +- fs/dlm/dlm_internal.h |2 +- fs/dlm/rcom.c | 11 ++- fs/dlm/recover.c |4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index ff97ba9..ce30136 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -235,7 +235,7 @@ int dlm_recover_directory(struct dlm_ls *ls) * pick namelen/name pairs out of received buffer */ - b = ls-ls_recover_buf + sizeof(struct dlm_rcom); + b = ls-ls_recover_buf-rc_buf; for (;;) { memcpy(namelen, b, sizeof(uint16_t)); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 187a5b5..f7fbaec 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -500,7 +500,7 @@ struct dlm_ls { struct rw_semaphore ls_recv_active; /* block dlm_recv */ struct list_headls_requestqueue;/* queue remote requests */ struct mutexls_requestqueue_mutex; - char*ls_recover_buf; + struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ uint64_tls_rcom_seq; spinlock_t ls_rcom_spin; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index fb07762..3f9b96f 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -129,7 +129,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) ls-ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { - rc = (struct dlm_rcom *) ls-ls_recover_buf; + rc = ls-ls_recover_buf; rc-rc_result = dlm_recover_status(ls); goto out; } @@ -148,7 +148,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) if (error) goto out; - rc = (struct dlm_rcom *) ls-ls_recover_buf; + rc = ls-ls_recover_buf; if (rc-rc_result == -ESRCH) { /* we pretend the remote lockspace exists with 0 status */ @@ -202,14 +202,15 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error = 0, len = sizeof(struct dlm_rcom); + int error = 0; + int max_size = dlm_config.ci_buffer_size - 
sizeof(struct dlm_rcom); ls-ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { dlm_copy_master_names(ls, last_name, last_len, - ls-ls_recover_buf + len, - dlm_config.ci_buffer_size - len, nodeid); + ls-ls_recover_buf-rc_buf, + max_size, nodeid); goto out; } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index df075dc..80aba5b 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -94,7 +94,7 @@ void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) { - struct dlm_rcom *rc = (struct dlm_rcom *) ls-ls_recover_buf; + struct dlm_rcom *rc = ls-ls_recover_buf; struct dlm_member *memb; int error = 0, delay; @@ -123,7 +123,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) { - struct dlm_rcom *rc = (struct dlm_rcom *) ls-ls_recover_buf; + struct dlm_rcom *rc = ls-ls_recover_buf; int error = 0, delay = 0, nodeid = ls-ls_low_nodeid; for (;;) { -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 01/16] dlm: use proper C for dlm/requestqueue stuff (and fix alignment bug)
From: Al Viro [EMAIL PROTECTED] a) don't cast the pointer to dlm_header *, we use it as dlm_message * anyway. b) we copy the message into a queue element, then pass the pointer to copy to dlm_receive_message_saved(); declare it properly to make sure that we have the right alignment. Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/lock.c |2 +- fs/dlm/requestqueue.c | 12 ++-- fs/dlm/requestqueue.h |2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index ff4a198..d9f07a4 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3802,7 +3802,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) { if (dlm_locking_stopped(ls)) { - dlm_add_requestqueue(ls, nodeid, (struct dlm_header *) ms); + dlm_add_requestqueue(ls, nodeid, ms); } else { dlm_wait_requestqueue(ls); _receive_message(ls, ms); diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 0de04f1..daa4183 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -20,7 +20,7 @@ struct rq_entry { struct list_head list; int nodeid; - char request[0]; + struct dlm_message request; }; /* @@ -30,10 +30,10 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. 
*/ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) { struct rq_entry *e; - int length = hd-h_length; + int length = ms-m_header.h_length - sizeof(struct dlm_message); e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!e) { @@ -42,7 +42,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) } e-nodeid = nodeid; - memcpy(e-request, hd, length); + memcpy(e-request, ms, ms-m_header.h_length); mutex_lock(ls-ls_requestqueue_mutex); list_add_tail(e-list, ls-ls_requestqueue); @@ -76,7 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls) e = list_entry(ls-ls_requestqueue.next, struct rq_entry, list); mutex_unlock(ls-ls_requestqueue_mutex); - dlm_receive_message_saved(ls, (struct dlm_message *)e-request); + dlm_receive_message_saved(ls, e-request); mutex_lock(ls-ls_requestqueue_mutex); list_del(e-list); @@ -176,7 +176,7 @@ void dlm_purge_requestqueue(struct dlm_ls *ls) mutex_lock(ls-ls_requestqueue_mutex); list_for_each_entry_safe(e, safe, ls-ls_requestqueue, list) { - ms = (struct dlm_message *) e-request; + ms = e-request; if (purge_request(ls, ms, e-nodeid)) { list_del(e-list); diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index aba34fc..10ce449 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -13,7 +13,7 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ 
at http://www.tux.org/lkml/
[PATCH 09/16] dlm: receive_rcom_lock_args() overflow check
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/lock.c |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6c605fc..0593dd8 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4271,7 +4271,6 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *r, struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc-rc_buf; - int lvblen; lkb-lkb_nodeid = rc-rc_header.h_nodeid; lkb-lkb_ownpid = le32_to_cpu(rl-rl_ownpid); @@ -4288,11 +4287,13 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb-lkb_astaddr = (void *) (long) (rl-rl_asts AST_COMP); if (lkb-lkb_exflags DLM_LKF_VALBLK) { + int lvblen = rc-rc_header.h_length - sizeof(struct dlm_rcom) - +sizeof(struct rcom_lock); + if (lvblen ls-ls_lvblen) + return -EINVAL; lkb-lkb_lvbptr = dlm_allocate_lvb(ls); if (!lkb-lkb_lvbptr) return -ENOMEM; - lvblen = rc-rc_header.h_length - sizeof(struct dlm_rcom) - -sizeof(struct rcom_lock); memcpy(lkb-lkb_lvbptr, rl-rl_lvb, lvblen); } -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 10/16] dlm: make find_rsb() fail gracefully when namelen is too large
From: Al Viro [EMAIL PROTECTED] We *can* get there from receive_request() and dlm_recover_master_copy() with namelen too large if incoming request is invalid; BUG() from DLM_ASSERT() in allocate_rsb() is a bit excessive reaction to that and in case of dlm_recover_master_copy() we would actually oops before that while calculating hash of up to 64Kb worth of data - with data actually being 64 _bytes_ in kmalloc()'ed struct. Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/lock.c |6 +- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 0593dd8..6d98cf9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -436,11 +436,15 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, { struct dlm_rsb *r, *tmp; uint32_t hash, bucket; - int error = 0; + int error = -EINVAL; + + if (namelen DLM_RESNAME_MAXLEN) + goto out; if (dlm_no_directory(ls)) flags |= R_CREATE; + error = 0; hash = jhash(name, namelen, 0); bucket = hash (ls-ls_rsbtbl_size - 1); -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 12/16] dlm: fix dlm_dir_lookup() handling of too long names
From: Al Viro [EMAIL PROTECTED] ... those can happen and BUG() from DLM_ASSERT() in allocate_direntry() is not a good way to handle them. Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/dir.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 831050e..85defeb 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -319,6 +319,9 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_unlock(ls-ls_dirtbl[bucket].lock); + if (namelen DLM_RESNAME_MAXLEN) + return -EINVAL; + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); if (!de) return -ENOMEM; -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/16] dlm: fix overflows when copying from ->m_extra to lvb
From: Al Viro [EMAIL PROTECTED] Signed-off-by: Al Viro [EMAIL PROTECTED] Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/lock.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6d98cf9..5b82187 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1226,6 +1226,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, b = dlm_lvb_operations[lkb-lkb_grmode + 1][lkb-lkb_rqmode + 1]; if (b == 1) { int len = receive_extralen(ms); + if (len DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; memcpy(lkb-lkb_lvbptr, ms-m_extra, len); lkb-lkb_lvbseq = ms-m_lvbseq; } @@ -2993,6 +2995,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (!lkb-lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); + if (len DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; memcpy(lkb-lkb_lvbptr, ms-m_extra, len); } return 0; -- 1.5.3.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 15/16] dlm: eliminate astparam type casting
Put lkb_astparam in a union with a dlm_user_args pointer to eliminate a lot of type casting. Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/debug_fs.c |6 ++ fs/dlm/dlm_internal.h |5 - fs/dlm/lock.c | 14 ++ fs/dlm/memory.c |2 +- fs/dlm/netlink.c |5 ++--- fs/dlm/user.c |8 +++- 6 files changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 12c3bfd..52b1196 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -162,14 +162,12 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) { - struct dlm_user_args *ua; unsigned int waiting = 0; uint64_t xid = 0; if (lkb-lkb_flags DLM_IFL_USER) { - ua = (struct dlm_user_args *) lkb-lkb_astparam; - if (ua) - xid = ua-xid; + if (lkb-lkb_ua) + xid = lkb-lkb_ua-xid; } if (lkb-lkb_timestamp) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index a53c237..d30ea8b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -253,7 +253,10 @@ struct dlm_lkb { struct dlm_lksb *lkb_lksb; /* caller's status block */ void(*lkb_astfn) (void *astparam); void(*lkb_bastfn) (void *astparam, int mode); - void*lkb_astparam; /* caller's ast arg */ + union { + void*lkb_astparam; /* caller's ast arg */ + struct dlm_user_args*lkb_ua; + }; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 94f8cbd..8f250ac 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4533,7 +4533,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, /* user can change the params on its lock when it converts it, or add an lvb that didn't exist before */ - ua = (struct dlm_user_args *)lkb-lkb_astparam; + ua = lkb-lkb_ua; if (flags DLM_LKF_VALBLK !ua-lksb.sb_lvbptr) { ua-lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); @@ -4584,7 +4584,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; - ua = (struct dlm_user_args *)lkb-lkb_astparam; + ua = 
lkb-lkb_ua; if (lvb_in ua-lksb.sb_lvbptr) memcpy(ua-lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); @@ -4633,7 +4633,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; - ua = (struct dlm_user_args *)lkb-lkb_astparam; + ua = lkb-lkb_ua; if (ua_tmp-castparam) ua-castparam = ua_tmp-castparam; ua-user_lksb = ua_tmp-user_lksb; @@ -4671,7 +4671,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error) goto out; - ua = (struct dlm_user_args *)lkb-lkb_astparam; + ua = lkb-lkb_ua; error = set_unlock_args(flags, ua, args); if (error) @@ -4710,7 +4710,6 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) { - struct dlm_user_args *ua = (struct dlm_user_args *)lkb-lkb_astparam; struct dlm_args args; int error; @@ -4719,7 +4718,7 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) list_add_tail(lkb-lkb_ownqueue, ls-ls_orphans); mutex_unlock(ls-ls_orphans_mutex); - set_unlock_args(0, ua, args); + set_unlock_args(0, lkb-lkb_ua, args); error = cancel_lock(ls, lkb, args); if (error == -DLM_ECANCEL) @@ -4732,11 +4731,10 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) { - struct dlm_user_args *ua = (struct dlm_user_args *)lkb-lkb_astparam; struct dlm_args args; int error; - set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, args); + set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb-lkb_ua, args); error = unlock_lock(ls, lkb, args); if (error == -DLM_EUNLOCK) diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index f778386..65e41e5 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -80,7 +80,7 @@ void dlm_free_lkb(struct dlm_lkb *lkb) { if (lkb-lkb_flags DLM_IFL_USER) { struct dlm_user_args *ua; - ua = (struct dlm_user_args *)lkb-lkb_astparam; + ua = lkb-lkb_ua; if (ua) { if (ua-lksb.sb_lvbptr) kfree(ua-lksb.sb_lvbptr); diff --git 
a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 863b87d..90374b8 100644 --- a/fs/dlm
[PATCH 14/16] dlm: proper types for asts and basts
Use proper types for ast and bast functions, and use consistent type for ast param. Signed-off-by: David Teigland [EMAIL PROTECTED] --- fs/dlm/ast.c |9 +++ fs/dlm/dlm_internal.h | 14 +--- fs/dlm/lock.c | 50 +++- fs/dlm/rcom.c |4 +- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 6308122..8bf31e3 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -39,7 +39,6 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type) dlm_user_add_ast(lkb, type); return; } - DLM_ASSERT(lkb-lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb);); spin_lock(ast_queue_lock); if (!(lkb-lkb_ast_type (AST_COMP | AST_BAST))) { @@ -58,8 +57,8 @@ static void process_asts(void) struct dlm_ls *ls = NULL; struct dlm_rsb *r = NULL; struct dlm_lkb *lkb; - void (*cast) (long param); - void (*bast) (long param, int mode); + void (*cast) (void *astparam); + void (*bast) (void *astparam, int mode); int type = 0, found, bmode; for (;;) { @@ -83,8 +82,8 @@ static void process_asts(void) if (!found) break; - cast = lkb-lkb_astaddr; - bast = lkb-lkb_bastaddr; + cast = lkb-lkb_astfn; + bast = lkb-lkb_bastfn; bmode = lkb-lkb_bastmode; if ((type AST_COMP) cast) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index f7fbaec..a53c237 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -92,8 +92,6 @@ do { \ } \ } -#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL) - struct dlm_direntry { struct list_headlist; @@ -146,9 +144,9 @@ struct dlm_recover { struct dlm_args { uint32_tflags; - void*astaddr; - longastparam; - void*bastaddr; + void(*astfn) (void *astparam); + void*astparam; + void(*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; unsigned long timeout; @@ -253,9 +251,9 @@ struct dlm_lkb { char*lkb_lvbptr; struct dlm_lksb *lkb_lksb; /* caller's status block */ - void*lkb_astaddr; /* caller's ast function */ - void*lkb_bastaddr; /* caller's bast function */ - longlkb_astparam; /* caller's ast arg */ + void(*lkb_astfn) (void *astparam); 
+ void(*lkb_bastfn) (void *astparam, int mode); + void*lkb_astparam; /* caller's ast arg */ }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 5b82187..94f8cbd 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1781,7 +1781,7 @@ static void grant_pending_locks(struct dlm_rsb *r) */ list_for_each_entry_safe(lkb, s, r-res_grantqueue, lkb_statequeue) { - if (lkb-lkb_bastaddr lock_requires_bast(lkb, high, cw)) { + if (lkb-lkb_bastfn lock_requires_bast(lkb, high, cw)) { if (cw high == DLM_LOCK_PR) queue_bast(r, lkb, DLM_LOCK_CW); else @@ -1811,7 +1811,7 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, struct dlm_lkb *gr; list_for_each_entry(gr, head, lkb_statequeue) { - if (gr-lkb_bastaddr modes_require_bast(gr, lkb)) { + if (gr-lkb_bastfn modes_require_bast(gr, lkb)) { queue_bast(r, gr, lkb-lkb_rqmode); gr-lkb_highbast = lkb-lkb_rqmode; } @@ -1966,8 +1966,11 @@ static void confirm_master(struct dlm_rsb *r, int error) } static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, -int namelen, unsigned long timeout_cs, void *ast, -void *astarg, void *bast, struct dlm_args *args) +int namelen, unsigned long timeout_cs, +void (*ast) (void *astparam), +void *astparam, +void (*bast) (void *astparam, int mode), +struct dlm_args *args) { int rv = -EINVAL; @@ -2017,9 +2020,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, an active lkb cannot be modified before locking the rsb */ args-flags = flags; - args-astaddr = ast; - args-astparam = (long) astarg; - args-bastaddr = bast; + args-astfn = ast; + args-astparam = astparam; + args-bastfn = bast; args-timeout = timeout_cs
[GIT PULL] dlm updates for 2.6.25 (batch 2)
Linus, Please pull a second batch of dlm updates for 2.6.25 from: git://git.kernel.org/pub/scm/linux/kernel/git/teigland/dlm.git for-linus Al Viro (13): dlm: use proper C for dlm/requestqueue stuff (and fix alignment bug) dlm: dlm_process_incoming_buffer() fixes dlm: do not byteswap rcom_lock dlm: do not byteswap rcom_config dlm: use proper type for ->ls_recover_buf dlm: missing length check in check_config() dlm: validate data in dlm_recover_directory() dlm: verify that places expecting rcom_lock have packet long enough dlm: receive_rcom_lock_args() overflow check dlm: make find_rsb() fail gracefully when namelen is too large dlm: fix overflows when copying from ->m_extra to lvb dlm: fix dlm_dir_lookup() handling of too long names dlm: dlm/user.c input validation fixes David Teigland (2): dlm: proper types for asts and basts dlm: eliminate astparam type casting Denis Cheng (1): dlm: add __init and __exit marks to init and exit functions fs/dlm/ast.c |9 ++-- fs/dlm/config.c |2 +- fs/dlm/debug_fs.c |8 +-- fs/dlm/dir.c | 28 +-- fs/dlm/dlm_internal.h | 53 +++ fs/dlm/lock.c | 139 + fs/dlm/lock.h |2 +- fs/dlm/lockspace.c|2 +- fs/dlm/memory.c |4 +- fs/dlm/midcomms.c | 33 +++- fs/dlm/netlink.c |9 ++-- fs/dlm/rcom.c | 63 ++ fs/dlm/recover.c |4 +- fs/dlm/requestqueue.c | 12 ++-- fs/dlm/requestqueue.h |2 +- fs/dlm/user.c | 29 -- fs/dlm/util.c | 61 - 17 files changed, 235 insertions(+), 225 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/