Before this change, IpcIoFile::WaitBeforePop() delayed both swap ins
(hits) and swap outs (misses). That is suboptimal because reads do not
usually accumulate unfinished I/O requests in OS buffers and, hence,
do not eventually force the OS to block all I/O.
Ideally, a disker should probably dequeue all pending disker requests,
satisfy reads ASAP, and only then handle writes, but that is difficult
to implement for several reasons. This patch implements a simpler
approach: peek at the next request to be popped and, if it is a swap
in (i.e., a read or hit), pop it without any delay.
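In condensed form, the new WaitBeforePop() logic looks like this (a
sketch distilled from the patch below, not a verbatim excerpt):

    int processId;
    IpcIoMsg ipcIo;
    if (!queue->peek(processId, ipcIo))
        return false; // nothing to delay; pop() is still attempted

    // ... update LastIo and the rate-limiting balance as before ...

    if (ipcIo.command == IpcIo::cmdWrite && balance > maxImbalance)
        ... delay the pop ...; // reads fall through and pop at once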
When a read is popped, we still adjust the balance member and LastIo
because we do want to maintain the configured average I/O rate. When a
write request comes in, it will be delayed [longer] if needed. In the
extreme case of a very long stream of read requests (no writes at
all), there will be essentially no I/O rate limiting, and that is what
we want.
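To illustrate the arithmetic with hypothetical numbers (they are not
part of the patch): with max-swap-rate=100, maxRate is 0.1 req/ms, so

    ioDuration   = 1/0.1 = 10 ms per I/O
    maxImbalance = min(100, 100 * 10) = 100 ms

Back-to-back pops (debit near zero) each add about 10 ms of credit to
the balance, so after a bit more than ten such pops the balance
exceeds 100 ms, and the next write is delayed by
balance - maxImbalance/2, i.e., roughly 50-60 ms. Reads keep being
popped immediately.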
---
src/DiskIO/IpcIo/IpcIoFile.cc | 13 ++++++++-----
src/cf.data.pre | 8 ++++++--
src/ipc/Queue.cc | 15 ---------------
src/ipc/Queue.h | 24 ++++++++++++++++++++++--
4 files changed, 36 insertions(+), 24 deletions(-)
diff --git src/DiskIO/IpcIo/IpcIoFile.cc src/DiskIO/IpcIo/IpcIoFile.cc
index 337fdb9..c2845fe 100644
--- src/DiskIO/IpcIo/IpcIoFile.cc
+++ src/DiskIO/IpcIo/IpcIoFile.cc
@@ -654,64 +654,67 @@ diskerWrite(IpcIoMsg &ipcIo)
void
IpcIoFile::DiskerHandleMoreRequests(void *source)
{
debugs(47, 7, HERE << "resuming handling requests after " <<
static_cast<const char *>(source));
DiskerHandleMoreRequestsScheduled = false;
IpcIoFile::DiskerHandleRequests();
}
bool
IpcIoFile::WaitBeforePop()
{
const Ipc::QueueReader::Rate::Value ioRate = queue->localRateLimit();
const double maxRate = ioRate/1e3; // req/ms
// do we need to enforce configured I/O rate?
if (maxRate <= 0)
return false;
// is there an I/O request we could potentially delay?
- if (!queue->popReady()) {
- // unlike pop(), popReady() is not reliable and does not block reader
+ int processId;
+ IpcIoMsg ipcIo;
+ if (!queue->peek(processId, ipcIo)) {
+ // unlike pop(), peek() is not reliable and does not block reader
// so we must proceed with pop() even if it is likely to fail
return false;
}
static timeval LastIo = current_time;
const double ioDuration = 1.0 / maxRate; // ideal distance between two I/Os
// do not accumulate more than 100ms or 100 I/Os, whichever is smaller
const int64_t maxImbalance = min(static_cast<int64_t>(100), static_cast<int64_t>(100 * ioDuration));
const double credit = ioDuration; // what the last I/O should have cost us
const double debit = tvSubMsec(LastIo, current_time); // actual distance from the last I/O
LastIo = current_time;
Ipc::QueueReader::Balance &balance = queue->localBalance();
balance += static_cast<int64_t>(credit - debit);
debugs(47, 7, HERE << "rate limiting balance: " << balance << " after +" << credit << " -" << debit);
- if (balance > maxImbalance) {
- // if we accumulated too much time for future slow I/Os,
- // then shed accumulated time to keep just half of the excess
+ if (ipcIo.command == IpcIo::cmdWrite && balance > maxImbalance) {
+ // if the next request is (likely) a write and we have accumulated
+ // too much time for future slow I/Os, then shed the accumulated
+ // time to keep just half of the excess
const int64_t toSpend = balance - maxImbalance/2;
if (toSpend/1e3 > Timeout)
debugs(47, DBG_IMPORTANT, "WARNING: Rock disker delays I/O " <<
"requests for " << (toSpend/1e3) << " seconds to obey " <<
ioRate << "/sec rate limit");
debugs(47, 3, HERE << "rate limiting by " << toSpend << " ms to get" <<
(1e3*maxRate) << "/sec rate");
eventAdd("IpcIoFile::DiskerHandleMoreRequests",
&IpcIoFile::DiskerHandleMoreRequests,
const_cast<char*>("rate limiting"),
toSpend/1e3, 0, false);
DiskerHandleMoreRequestsScheduled = true;
return true;
} else
if (balance < -maxImbalance) {
// do not owe "too much" to avoid "too large" bursts of I/O
balance = -maxImbalance;
}
diff --git src/cf.data.pre src/cf.data.pre
index 11d333e..a97bafb 100644
--- src/cf.data.pre
+++ src/cf.data.pre
@@ -2761,43 +2761,47 @@ DOC_START
The rock store type:
cache_dir rock Directory-Name Mbytes <max-size=bytes> [options]
The Rock Store type is a database-style storage. All cached
entries are stored in a "database" file, using fixed-size slots,
one entry per slot. The database size is specified in MB. The
slot size is specified in bytes using the max-size option. See
below for more info on the max-size option.
swap-timeout=msec: Squid will not start writing a miss to or
reading a hit from disk if it estimates that the swap operation
will take more than the specified number of milliseconds. By
default and when set to zero, disables the disk I/O time limit
enforcement. Ignored when using blocking I/O module because
blocking synchronous I/O does not allow Squid to estimate the
expected swap wait time.
max-swap-rate=swaps/sec: Artificially limits disk access using
- the specified I/O rate limit. Swap in and swap out requests that
+ the specified I/O rate limit. Swap out requests that
would cause the average I/O rate to exceed the limit are
- delayed. This is necessary on file systems that buffer "too
+ delayed. Individual swap in requests (i.e., hits or reads) are
+ not delayed, but they do contribute to the measured swap rate
+ and, since they are placed in the same FIFO queue as swap out
+ requests, they may wait longer when max-swap-rate is small.
+ This is necessary on file systems that buffer "too
many" writes and then start blocking Squid and other processes
while committing those writes to disk. Usually used together
with swap-timeout to avoid excessive delays and queue overflows
when disk demand exceeds available disk "bandwidth". By default
and when set to zero, disables the disk I/O rate limit
enforcement. Currently supported by IpcIo module only.
The coss store type:
NP: COSS filesystem in Squid-3 has been deemed too unstable for
production use and has thus been removed from this release.
We hope that it can be made usable again soon.
block-size=n defines the "block size" for COSS cache_dir's.
Squid uses file numbers as block numbers. Since file numbers
are limited to 24 bits, the block size determines the maximum
size of the COSS partition. The default is 512 bytes, which
leads to a maximum cache_dir size of 512<<24, or 8 GB. Note
you should not change the coss block size after Squid
diff --git src/ipc/Queue.cc src/ipc/Queue.cc
index 24e6706..c794a7d 100644
--- src/ipc/Queue.cc
+++ src/ipc/Queue.cc
@@ -216,55 +216,40 @@ const Ipc::QueueReader &
Ipc::FewToFewBiQueue::reader(const Group group, const int processId) const
{
return readers->theReaders[readerIndex(group, processId)];
}
void
Ipc::FewToFewBiQueue::clearReaderSignal(const int remoteProcessId)
{
QueueReader &localReader = reader(theLocalGroup, theLocalProcessId);
debugs(54, 7, HERE << "reader: " << localReader.id);
Must(validProcessId(remoteGroup(), remoteProcessId));
localReader.clearSignal();
// we got a hint; we could reposition iteration to try popping from the
// remoteProcessId queue first; but it does not seem to help much and might
// introduce some bias so we do not do that for now:
// theLastPopProcessId = remoteProcessId;
}
-bool
-Ipc::FewToFewBiQueue::popReady() const
-{
- // mimic FewToFewBiQueue::pop() but quit just before popping
- int popProcessId = theLastPopProcessId; // preserve for future pop()
- for (int i = 0; i < remoteGroupSize(); ++i) {
- if (++popProcessId >= remoteGroupIdOffset() + remoteGroupSize())
- popProcessId = remoteGroupIdOffset();
- const OneToOneUniQueue &queue = oneToOneQueue(remoteGroup(), popProcessId, theLocalGroup, theLocalProcessId);
- if (!queue.empty())
- return true;
- }
- return false; // most likely, no process had anything to pop
-}
-
Ipc::QueueReader::Balance &
Ipc::FewToFewBiQueue::localBalance()
{
QueueReader &r = reader(theLocalGroup, theLocalProcessId);
return r.balance;
}
Ipc::QueueReader::Rate &
Ipc::FewToFewBiQueue::localRateLimit()
{
QueueReader &r = reader(theLocalGroup, theLocalProcessId);
return r.rateLimit;
}
Ipc::FewToFewBiQueue::Metadata::Metadata(const int aGroupASize, const int aGroupAIdOffset, const int aGroupBSize, const int aGroupBIdOffset):
theGroupASize(aGroupASize), theGroupAIdOffset(aGroupAIdOffset),
theGroupBSize(aGroupBSize), theGroupBIdOffset(aGroupBIdOffset)
{
Must(theGroupASize > 0);
Must(theGroupBSize > 0);
diff --git src/ipc/Queue.h src/ipc/Queue.h
index 72fe3e5..56642ac 100644
--- src/ipc/Queue.h
+++ src/ipc/Queue.h
@@ -185,42 +185,42 @@ public:
enum Group { groupA = 0, groupB = 1 };
FewToFewBiQueue(const String &id, const Group aLocalGroup, const int aLocalProcessId);
Group localGroup() const { return theLocalGroup; }
Group remoteGroup() const { return theLocalGroup == groupA ? groupB : groupA; }
/// clears the reader notification received by the local process from the remote process
void clearReaderSignal(const int remoteProcessId);
/// picks a process and calls OneToOneUniQueue::pop() using its queue
template <class Value> bool pop(int &remoteProcessId, Value &value);
/// calls OneToOneUniQueue::push() using the given process queue
template <class Value> bool push(const int remoteProcessId, const Value &value);
/// finds the oldest item in incoming and outgoing queues between
/// us and the given remote process
template<class Value> bool findOldest(const int remoteProcessId, Value &value) const;
- /// returns true if pop() would have probably succeeded but does not pop()
- bool popReady() const;
+ /// peeks at the item likely to be pop()ed next
+ template<class Value> bool peek(int &remoteProcessId, Value &value) const;
/// returns local reader's balance
QueueReader::Balance &localBalance();
/// returns local reader's rate limit
QueueReader::Rate &localRateLimit();
private:
bool validProcessId(const Group group, const int processId) const;
int oneToOneQueueIndex(const Group fromGroup, const int fromProcessId, const Group toGroup, const int toProcessId) const;
const OneToOneUniQueue &oneToOneQueue(const Group fromGroup, const int fromProcessId, const Group toGroup, const int toProcessId) const;
OneToOneUniQueue &oneToOneQueue(const Group fromGroup, const int fromProcessId, const Group toGroup, const int toProcessId);
QueueReader &reader(const Group group, const int processId);
const QueueReader &reader(const Group group, const int processId) const;
int readerIndex(const Group group, const int processId) const;
int remoteGroupSize() const { return theLocalGroup == groupA ? metadata->theGroupBSize : metadata->theGroupASize; }
int remoteGroupIdOffset() const { return theLocalGroup == groupA ? metadata->theGroupBIdOffset : metadata->theGroupAIdOffset; }
private:
const Mem::Pointer<Metadata> metadata; ///< shared metadata
@@ -351,23 +351,43 @@ FewToFewBiQueue::push(const int remoteProcessId, const Value &value)
template <class Value>
bool
FewToFewBiQueue::findOldest(const int remoteProcessId, Value &value) const
{
// we may be called before remote process configured its queue end
if (!validProcessId(remoteGroup(), remoteProcessId))
return false;
// we need the oldest value, so start with the incoming, them-to-us queue:
const OneToOneUniQueue &inQueue = oneToOneQueue(remoteGroup(), remoteProcessId, theLocalGroup, theLocalProcessId);
debugs(54, 2, HERE << "peeking from " << remoteProcessId << " to " << theLocalProcessId << " at " << inQueue.size());
if (inQueue.peek(value))
return true;
// if the incoming queue is empty, check the outgoing, us-to-them queue:
const OneToOneUniQueue &outQueue = oneToOneQueue(theLocalGroup, theLocalProcessId, remoteGroup(), remoteProcessId);
debugs(54, 2, HERE << "peeking from " << theLocalProcessId << " to " << remoteProcessId << " at " << outQueue.size());
return outQueue.peek(value);
}
+template <class Value>
+bool
+FewToFewBiQueue::peek(int &remoteProcessId, Value &value) const
+{
+ // mimic FewToFewBiQueue::pop() but quit just before popping
+ int popProcessId = theLastPopProcessId; // preserve for future pop()
+ for (int i = 0; i < remoteGroupSize(); ++i) {
+ if (++popProcessId >= remoteGroupIdOffset() + remoteGroupSize())
+ popProcessId = remoteGroupIdOffset();
+ const OneToOneUniQueue &queue =
+ oneToOneQueue(remoteGroup(), popProcessId,
+ theLocalGroup, theLocalProcessId);
+ if (queue.peek(value)) {
+ remoteProcessId = popProcessId;
+ return true;
+ }
+ }
+ return false; // most likely, no process had anything to pop
+}
+
} // namespace Ipc
#endif // SQUID_IPC_QUEUE_H
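For reference, a hypothetical cache_dir line combining the options
documented in the cf.data.pre hunk above (the path and all numbers
are illustrative only, not recommendations):

    cache_dir rock /var/cache/squid 4096 max-size=32768 swap-timeout=300 max-swap-rate=200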