On 21/03/16 18:42, Jim Meyering wrote:
> On Mon, Mar 21, 2016 at 8:57 AM, Pádraig Brady <[email protected]> wrote:
>> I was wondering if we should not penalize all normal files,
>> to handle the /proc and /sys approximate st_size edge case?
>> I.E. only do the seek to last block of file + read() when st_size %
>> PAGE_SIZE == 0 ?
>>
>> I did a quick check of /proc and /sys sizes:
>>
>> $ find /proc/ /sys/ -type f -printf "%p\t%s\n" 2>/dev/null |
>>   tr ' ' _ | sort -k2,2n | uniq -f1 -c
>>
>>  121494 /proc/100/attr/current  0
> ...
>>   19610 /proc/bus/pci/00/1b.0   4096
> ...
>> In all cases where st_size % PAGE_SIZE != 0, it would be the same or better
>> to take the st_size.
>> In most cases it was the same. There are a couple of cases with larger sizes
>> where it would
>> have been better to take the st_size rather than reading, I.E.:
>>
>> /sys/devices/pci0000:00/0000:00:1a.0/usb3/3-1/3-1.4/descriptors
>>   st_size = 65553, available bytes = 195, wc -c = 65552
>> /sys/devices/pci0000:00/0000:00:02.0/rom
>>   st_size = 131072, available bytes = 0, wc -c = 127007 (EINVAL)
>> /sys/devices/pci0000:00/0000:00:02.0/resource0
>>   st_size = 4194304, available bytes = 0, wc -c = 4191231 (EIO)
>>
>> Also related to this is that we currently need read access to any file
>> we want to wc -c, whereas with this we wouldn't. Well we would for %
>> PAGE_SIZE files,
>> but we probably should fall back to st_size on EPERM in any case?
> 
> Good ideas. I agree and like the proposal.
> Not actually reading makes the results more conceptually consistent
> with the usual (non-special) case of files listed on the command line.

Patch attached.

cheers
>From f8bf4f8411ddf8878beb21cde2f5f60f8f94860f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Thu, 22 Dec 2016 14:31:44 +0000
Subject: [PATCH] wc: with only --bytes, determine size more efficiently

* src/wc.c (wc): Avoid reading the end of the file
when the size is not a multiple of PAGE_SIZE,
as the special case handling for files in /proc and /sys
is only required when st_size is 0 or a multiple of PAGE_SIZE.
* tests/misc/wc-proc.sh: Add a test case.
---
 src/wc.c              | 48 +++++++++++++++++++++++++++++++++++++-----------
 tests/misc/wc-proc.sh | 13 +++++++++++++
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/wc.c b/src/wc.c
index 64df50c..a02379b 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -71,6 +71,9 @@ static int number_width;
 /* True if we have ever read the standard input. */
 static bool have_read_stdin;
 
+/* Used to determine if file size can be determined without reading.  */
+static size_t page_size;
+
 /* The result of calling fstat or stat on a file descriptor or file.  */
 struct fstatus
 {
@@ -235,6 +238,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
 
   if (count_bytes && !count_chars && !print_lines && !count_complicated)
     {
+      bool skip_read = false;
+
       if (0 < fstatus->failed)
         fstatus->failed = fstat (fd, &fstatus->st);
 
@@ -245,24 +250,44 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
           && 0 <= fstatus->st.st_size)
         {
           size_t end_pos = fstatus->st.st_size;
-          off_t hi_pos = end_pos - end_pos % (ST_BLKSIZE (fstatus->st) + 1);
           if (current_pos < 0)
             current_pos = lseek (fd, 0, SEEK_CUR);
-          if (0 <= current_pos && current_pos < hi_pos
-              && 0 <= lseek (fd, hi_pos, SEEK_CUR))
-            bytes = hi_pos - current_pos;
+
+          if (end_pos % page_size)
+            {
+              /* We only need special handling of /proc and /sys files etc.
+                 when they're a multiple of PAGE_SIZE.  In the common case
+                 for files with st_size not a multiple of PAGE_SIZE,
+                 it's more efficient and accurate to use st_size.
+
+                 Be careful here.  The current position may actually be
+                 beyond the end of the file.  As in the example above.  */
+
+              bytes = end_pos < current_pos ? 0 : end_pos - current_pos;
+              skip_read = true;
+            }
+          else
+            {
+              off_t hi_pos = end_pos - end_pos % (ST_BLKSIZE (fstatus->st) + 1);
+              if (0 <= current_pos && current_pos < hi_pos
+                  && 0 <= lseek (fd, hi_pos, SEEK_CUR))
+                bytes = hi_pos - current_pos;
+            }
         }
 
-      fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
-      while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
+      if (! skip_read)
         {
-          if (bytes_read == SAFE_READ_ERROR)
+          fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL);
+          while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
             {
-              error (0, errno, "%s", quotef (file));
-              ok = false;
-              break;
+              if (bytes_read == SAFE_READ_ERROR)
+                {
+                  error (0, errno, "%s", quotef (file));
+                  ok = false;
+                  break;
+                }
+              bytes += bytes_read;
             }
-          bytes += bytes_read;
         }
     }
   else if (!count_chars && !count_complicated)
@@ -639,6 +664,7 @@ main (int argc, char **argv)
 
   atexit (close_stdout);
 
+  page_size = getpagesize ();
   /* Line buffer stdout to ensure lines are written atomically and immediately
      so that processes running in parallel do not intersperse their output.  */
   setvbuf (stdout, NULL, _IOLBF, 0);
diff --git a/tests/misc/wc-proc.sh b/tests/misc/wc-proc.sh
index d6a36ba..eb78f09 100755
--- a/tests/misc/wc-proc.sh
+++ b/tests/misc/wc-proc.sh
@@ -19,6 +19,7 @@
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ wc
 
+# Ensure we read() /proc files to determine content length
 for file in /proc/version /sys/kernel/profiling; do
   if test -r $file; then
     cp -f $file copy &&
@@ -29,4 +30,16 @@ for file in /proc/version /sys/kernel/profiling; do
   fi
 done
 
+# Ensure we handle cases where don't read()
+truncate -s 2 no_read || framework_failure_
+# read() used when multiple of page size
+truncate -s 1048576 do_read || framework_failure_
+wc -c no_read do_read > out || fail=1
+cat <<\EOF > exp
+      2 no_read
+1048576 do_read
+1048578 total
+EOF
+compare exp out || fail=1
+
 Exit $fail
-- 
2.5.5

Reply via email to