On Dec 5, 2012, at 4:48 PM, Edgar Fuß <e...@math.uni-bonn.de> wrote:
>> The attached diff tries to coalesce writes to the journal in MAXPHYS >> sized and aligned blocks. > First, many thanks to hannken@ for ending my monthlong performance nightmare! > > Btw., this also explains my tstile lockups: writing the journal could take > tens of seconds; because the journal was locked, most actions on the fs would > stall for that time. > >> Comments or objections anyone? > I have a few questions. But I am by far no file system expert. So identifiers > confusing me may be evident for an expert or it may be just my ignorance that > some invariants are not obvious to me. > >> + daddr_t wl_buffer_addr; /* l: buffer base address */ > With "base address" I was thinking of a memory address, not a disc address. > Is it common practice to use xxx_addr for daddrs? > >> + size_t wl_buffer_len; /* l: buffer current use */ > "len" made me think of allocated length, not used length. > Again, that use may be common practice. > >> + * If not adjacent to buffered dat flush first. > s/dat/data/ > >> + pbn != wl->wl_buffer_addr + btodb(wl->wl_buffer_len)) { > Is it obvious to anyone but me why wl_buffer_addr is initialized at this > point? > I'm now sure it is, it just took me some time and a private mail to learn. > >> + KASSERT(resid > 0); > It took me even more time to learn why resid is non-negative here. > Would the explanation be worth a comment? > However, I'm still not sure why it can't be zero. > >> + error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_len, >> + wl->wl_devvp, wl->wl_buffer_addr, B_WRITE); > I suppose it's on purpose not to use wapbl_write() here. Yes, wapbl_buffered_write is a helper like wapbl_write and there is no need to go through wapbl_write. An updated diff (names and comments) is attached. -- J. Hannken-Illjes - hann...@eis.cs.tu-bs.de - TU Braunschweig (Germany)
Index: vfs_wapbl.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v retrieving revision 1.53 diff -p -u -2 -r1.53 vfs_wapbl.c --- vfs_wapbl.c 17 Nov 2012 10:10:17 -0000 1.53 +++ vfs_wapbl.c 5 Dec 2012 17:08:33 -0000 @@ -185,4 +185,8 @@ struct wapbl { SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction accounting */ + + u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ + daddr_t wl_buffer_dblk; /* l: buffer disk block address */ + size_t wl_buffer_used; /* l: buffer current use */ }; @@ -490,4 +494,7 @@ wapbl_start(struct wapbl ** wlp, struct wl->wl_dealloclim); + wl->wl_buffer = wapbl_alloc(MAXPHYS); + wl->wl_buffer_used = 0; + wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); @@ -538,4 +545,5 @@ wapbl_start(struct wapbl ** wlp, struct wapbl_free(wl->wl_dealloclens, sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); + wapbl_free(wl->wl_buffer, MAXPHYS); wapbl_inodetrk_free(wl); wapbl_free(wl, sizeof(*wl)); @@ -717,4 +725,5 @@ wapbl_stop(struct wapbl *wl, int force) wapbl_free(wl->wl_dealloclens, sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); + wapbl_free(wl->wl_buffer, MAXPHYS); wapbl_inodetrk_free(wl); @@ -792,4 +801,78 @@ wapbl_read(void *data, size_t len, struc /* + * Flush buffered data if any. + */ +static int +wapbl_buffered_flush(struct wapbl *wl) +{ + int error; + + if (wl->wl_buffer_used == 0) + return 0; + + error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, + wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); + wl->wl_buffer_used = 0; + + return error; +} + +/* + * Write data to the log. + * Try to coalesce writes and emit MAXPHYS aligned blocks. + */ +static int +wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn) +{ + int error; + size_t resid; + + /* + * If not adjacent to buffered data flush first. Disk block + * address is always valid for non-empty buffer. + */ + if (wl->wl_buffer_used > 0 && + pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) { + error = wapbl_buffered_flush(wl); + if (error) + return error; + } + /* + * If this write goes to an empty buffer we have to + * save the disk block address first. + */ + if (wl->wl_buffer_used == 0) + wl->wl_buffer_dblk = pbn; + /* + * Remaining space so this buffer ends on a MAXPHYS boundary. + * + * Cannot become less or equal zero as the buffer would have been + * flushed on the last call then. + */ + resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) - + wl->wl_buffer_used; + KASSERT(resid > 0); + KASSERT(dbtob(btodb(resid)) == resid); + if (len >= resid) { + memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid); + wl->wl_buffer_used += resid; + error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, + wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); + data = (uint8_t *)data + resid; + len -= resid; + wl->wl_buffer_dblk = pbn + btodb(resid); + wl->wl_buffer_used = 0; + if (error) + return error; + } + if (len > 0) { + memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len); + wl->wl_buffer_used += len; + } + + return 0; +} + +/* * Off is byte offset returns new offset for next write * handles log wraparound @@ -814,5 +897,5 @@ wapbl_circ_write(struct wapbl *wl, void pbn = btodb(pbn << wl->wl_log_dev_bshift); #endif - error = wapbl_write(data, slen, wl->wl_devvp, pbn); + error = wapbl_buffered_write(data, slen, wl, pbn); if (error) return error; @@ -825,5 +908,5 @@ wapbl_circ_write(struct wapbl *wl, void pbn = btodb(pbn << wl->wl_log_dev_bshift); #endif - error = wapbl_write(data, len, wl->wl_devvp, pbn); + error = wapbl_buffered_write(data, len, wl, pbn); if (error) return error; @@ -1968,4 +2051,7 @@ wapbl_write_commit(struct wapbl *wl, off daddr_t pbn; + error = wapbl_buffered_flush(wl); + if (error) + return error; /* * flush disk cache to ensure that blocks we've written are actually @@ -1999,5 +2085,8 @@ wapbl_write_commit(struct wapbl *wl, off pbn = btodb(pbn << wc->wc_log_dev_bshift); #endif - error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn); + error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn); + if (error) + return error; + error = wapbl_buffered_flush(wl); if (error) return error;