msgpairarray-decode.patch:
------------------------
This cleans up how decoding errors (for responses) are handled in msgpairarray. It now saves the error in a slot specific to each particular msgpair, so that the problem will show up in EDETAIL if possible, rather than as a generic problem with all servers.

sys-immed-complete.patch:
------------------------
I think this may have already been fixed independently by Sam, but we can compare notes (I haven't looked at what's in CVS yet). This makes sure that items that complete without the sys wait/test function being called get removed from the completion queue properly. It looks like the removal function, conditional_remove_sm_if_in_completion_list(), just wasn't ever completely filled in: it set the matching slot to NULL but never compacted the list or decremented its index. The most reliable way to trigger it is to use mgmt_get_dfile_array() repeatedly (as when fscking a large fs). That function always completes without testing, and so after 256 iterations it fills the completion queue and hits an assertion.

sys-null-tabfile.patch:
-----------------------
This probably wasn't really hurting much, but the cleanup is good anyway. This patch makes the system interface skip over any tabfiles (in its array of tabfiles to try) that are NULL, rather than calling open() on them. The semantics haven't changed with this patch, but we don't have to trust the open() function to be polite when we give it a NULL parameter.

sys-remove-fsck-warnings.patch:
------------------------
This just adds more verbose error messages to sys-remove to help track down a problem. We did find an issue later, I think, but I don't have that patch ready yet.

-Phil
Index: pvfs2_src/src/common/misc/msgpairarray.sm
===================================================================
--- pvfs2_src/src/common/misc/msgpairarray.sm	(revision 1591)
+++ pvfs2_src/src/common/misc/msgpairarray.sm	(revision 1592)
@@ -529,17 +529,16 @@
         if (ret != 0)
         {
             PVFS_perror_gossip("msgpairarray decode error", ret);
-            js_p->error_code = ret;
-            return 1;
+            msg_p->op_status = ret;
         }
+        else
+        {
+            /* if we've made it this far, the server response status is
+             * meaningful, so we save it.
+             */
+            msg_p->op_status = resp_p->status;
+        }
 
-        assert(resp_p->status <= 0);
-
-        /* if we've made it this far, the server response status is
-         * meaningful, so we save it.
-         */
-        msg_p->op_status = resp_p->status;
-
         /* NOTE: we call the function associated with each message,
          *       not just the one from the first array element.  so
          *       there could in theory be different functions for each
Index: pvfs2_src/src/client/sysint/client-state-machine.c
===================================================================
--- pvfs2_src/src/client/sysint/client-state-machine.c	(revision 1650)
+++ pvfs2_src/src/client/sysint/client-state-machine.c	(revision 1651)
@@ -67,23 +67,29 @@
   if the sm was added to the completion list, it MUST be removed
   before returning from test()
 */
-static int conditional_remove_sm_if_in_completion_list(
+static void conditional_remove_sm_if_in_completion_list(
     PINT_client_sm *sm_p)
 {
-    int found = 0, i = 0;
+    int i = 0;
 
     gen_mutex_lock(&s_completion_list_mutex);
     for(i = 0; i < s_completion_list_index; i++)
     {
         if (s_completion_list[i] == sm_p)
         {
-            s_completion_list[i] = NULL;
-            found = 1;
+            /* when we remove an operation from the completion array, just
+             * pull the last one up to take its place
+             */
+            s_completion_list[i] =
+                s_completion_list[s_completion_list_index - 1];
+            s_completion_list[s_completion_list_index-1] = NULL;
+            s_completion_list_index--;
             break;
         }
     }
+
     gen_mutex_unlock(&s_completion_list_mutex);
-    return found;
+    return;
 }
 
 static PVFS_error completion_list_retrieve_completed(
Index: pvfs2_src/src/common/misc/pvfs2-util.c
===================================================================
--- pvfs2_src/src/common/misc/pvfs2-util.c	(revision 1731)
+++ pvfs2_src/src/common/misc/pvfs2-util.c	(revision 1732)
@@ -266,28 +266,31 @@
      */
     for (i = 0; (i < file_count && !targetfile); i++)
     {
-        PINT_fstab_open(mnt_fp, file_list[i]);
-        if (mnt_fp)
+        if(file_list[i])
         {
-            while ((tmp_ent = PINT_fstab_next_entry(mnt_fp)))
+            PINT_fstab_open(mnt_fp, file_list[i]);
+            if (mnt_fp)
             {
-                if(!(PINT_FSTAB_NAME(tmp_ent)) || 
-                   !(strncmp(PINT_FSTAB_NAME(tmp_ent), "#", 1)))
+                while ((tmp_ent = PINT_fstab_next_entry(mnt_fp)))
                 {
-                    /* this entry is a comment */
+                    if(!(PINT_FSTAB_NAME(tmp_ent)) || 
+                       !(strncmp(PINT_FSTAB_NAME(tmp_ent), "#", 1)))
+                    {
+                        /* this entry is a comment */
+                        PINT_fstab_entry_destroy(tmp_ent);
+                        continue;
+                    }
+
+                    if (strcmp(PINT_FSTAB_TYPE(tmp_ent), "pvfs2") == 0)
+                    {
+                        targetfile = file_list[i];
+                        tmp_mntent_count++;
+                    }
+
                     PINT_fstab_entry_destroy(tmp_ent);
-                    continue;
                 }
-
-                if (strcmp(PINT_FSTAB_TYPE(tmp_ent), "pvfs2") == 0)
-                {
-                    targetfile = file_list[i];
-                    tmp_mntent_count++;
-                }
-
-                PINT_fstab_entry_destroy(tmp_ent);
+                PINT_fstab_close(mnt_fp);
             }
-            PINT_fstab_close(mnt_fp);
         }
     }
 
Index: pvfs2_src/src/client/sysint/remove.sm
===================================================================
--- pvfs2_src/src/client/sysint/remove.sm	(revision 1521)
+++ pvfs2_src/src/client/sysint/remove.sm	(revision 1522)
@@ -295,6 +295,7 @@
 static int remove_datafile_remove_failure(PINT_client_sm *sm_p,
 					  job_status_s *js_p)
 {
+    gossip_err("Error: failed removing one or more datafiles associated with the meta handle %llu\n", llu(sm_p->object_ref.handle));
     HANDLE_REMOVE_ERROR("datafile_remove_failure");
     return 1;
 }
@@ -315,6 +316,7 @@
     }
     else
     {
+        gossip_err("Error: failed removing handle %llu\n", llu(sm_p->object_ref.handle));
         HANDLE_REMOVE_ERROR("object_remove_failure");
     }
     return 1;
Index: pvfs2_src/src/client/sysint/sys-remove.sm
===================================================================
--- pvfs2_src/src/client/sysint/sys-remove.sm	(revision 1521)
+++ pvfs2_src/src/client/sysint/sys-remove.sm	(revision 1522)
@@ -507,8 +507,10 @@
 
     sm_p->u.remove.stored_error_code = js_p->error_code;
 
-    PVFS_perror_gossip("Failed to replace directory entry!",
-                       js_p->error_code);
+    gossip_err("Error: failed to replace directory during remove recovery: entry %s for object %llu.\n",
+        sm_p->u.remove.object_name,
+        llu(sm_p->object_ref.handle));
+    PVFS_perror_gossip("crdirent", js_p->error_code);
 
     PRINT_REMOVE_WARNING();
     return 1;
@@ -541,6 +543,9 @@
 	 * strand objects, or remove non-empty directories, for
 	 * example.
 	 */
+        gossip_err("Warning: Received ENOENT on retry to remove entry %s.\n",
+            sm_p->u.remove.object_name);
+
         PRINT_REMOVE_WARNING();
         js_p->error_code = 0;
         return 1;
_______________________________________________
Pvfs2-developers mailing list
Pvfs2-developers@beowulf-underground.org
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to