Re: freezero(3) for stdio's internal buffers

2017-11-30 Thread kshe
On Tue, 28 Nov 2017 05:52:25 +, Theo de Raadt wrote:
> You could compare make build times.  I suspect this will slightly
> exceed the noise floor.

Without the patch, my fastest laptop (amd64, 8-core i7 @ 3.4 GHz)
managed to

$ make obj && make -j8 build

in exactly 2359 seconds; with the patch applied, the same command
happened to perform (insignificantly) faster, finishing in 2352 seconds.
These were done on an otherwise completely idle system (not even cron(8)
was running) and as the first executed command after a full reboot, with
both /usr/src and /usr/obj mounted as mfs.

As I was expecting, the performance impact of this diff appears to have
been overestimated.  A slightly improved version follows; further
feedback is of course welcome.

Index: asprintf.c
===
RCS file: /cvs/src/lib/libc/stdio/asprintf.c,v
retrieving revision 1.25
diff -u -p -r1.25 asprintf.c
--- asprintf.c  17 Mar 2017 14:53:08 -  1.25
+++ asprintf.c  30 Nov 2017 04:05:21 -
@@ -61,7 +61,7 @@ asprintf(char **str, const char *fmt, ..
return (ret);
 
 err:
-   free(f._bf._base);
+   freezero(f._bf._base, f._bf._size);
f._bf._base = NULL;
*str = NULL;
errno = ENOMEM;
Index: fclose.c
===
RCS file: /cvs/src/lib/libc/stdio/fclose.c,v
retrieving revision 1.10
diff -u -p -r1.10 fclose.c
--- fclose.c31 Aug 2015 02:53:57 -  1.10
+++ fclose.c30 Nov 2017 04:05:21 -
@@ -51,7 +51,7 @@ fclose(FILE *fp)
if (fp->_close != NULL && (*fp->_close)(fp->_cookie) < 0)
r = EOF;
if (fp->_flags & __SMBF)
-   free((char *)fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
if (HASUB(fp))
FREEUB(fp);
if (HASLB(fp))
Index: fmemopen.c
===
RCS file: /cvs/src/lib/libc/stdio/fmemopen.c,v
retrieving revision 1.3
diff -u -p -r1.3 fmemopen.c
--- fmemopen.c  31 Aug 2015 02:53:57 -  1.3
+++ fmemopen.c  30 Nov 2017 04:05:21 -
@@ -107,7 +107,7 @@ fmemopen_close_free(void *v)
 {
struct state*st = v;
 
-   free(st->string);
+   freezero(st->string, st->size);
free(st);
 
return (0);
Index: freopen.c
===
RCS file: /cvs/src/lib/libc/stdio/freopen.c,v
retrieving revision 1.16
diff -u -p -r1.16 freopen.c
--- freopen.c   21 Sep 2016 04:38:56 -  1.16
+++ freopen.c   30 Nov 2017 04:05:21 -
@@ -106,7 +106,7 @@ freopen(const char *file, const char *mo
if (isopen && f != wantfd)
(void) (*fp->_close)(fp->_cookie);
if (fp->_flags & __SMBF)
-   free((char *)fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
fp->_w = 0;
fp->_r = 0;
fp->_p = NULL;
Index: local.h
===
RCS file: /cvs/src/lib/libc/stdio/local.h,v
retrieving revision 1.25
diff -u -p -r1.25 local.h
--- local.h 23 May 2016 00:21:48 -  1.25
+++ local.h 30 Nov 2017 04:05:21 -
@@ -82,7 +82,7 @@ __END_HIDDEN_DECLS
 #defineHASUB(fp) (_UB(fp)._base != NULL)
 #defineFREEUB(fp) { \
if (_UB(fp)._base != (fp)->_ubuf) \
-   free(_UB(fp)._base); \
+   freezero(_UB(fp)._base, _UB(fp)._size); \
_UB(fp)._base = NULL; \
 }
 
@@ -91,7 +91,7 @@ __END_HIDDEN_DECLS
  */
 #defineHASLB(fp) ((fp)->_lb._base != NULL)
 #defineFREELB(fp) { \
-   free((char *)(fp)->_lb._base); \
+   freezero((fp)->_lb._base, (fp)->_lb._size); \
(fp)->_lb._base = NULL; \
 }
 
Index: setvbuf.c
===
RCS file: /cvs/src/lib/libc/stdio/setvbuf.c,v
retrieving revision 1.14
diff -u -p -r1.14 setvbuf.c
--- setvbuf.c   21 Sep 2016 04:38:56 -  1.14
+++ setvbuf.c   30 Nov 2017 04:05:21 -
@@ -70,7 +70,7 @@ setvbuf(FILE *fp, char *buf, int mode, s
fp->_r = fp->_lbfsize = 0;
flags = fp->_flags;
if (flags & __SMBF)
-   free(fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
flags &= ~(__SLBF | __SNBF | __SMBF | __SOPT | __SNPT | __SEOF);
 
/* If setting unbuffered mode, skip all the hard work. */
Index: vasprintf.c
===
RCS file: /cvs/src/lib/libc/stdio/vasprintf.c,v
retrieving revision 1.22
diff -u -p -r1.22 vasprintf.c
--- vasprintf.c 17 Mar 2017 14:53:08 -  1.22
+++ vasprintf.c 30 Nov 2017 04:05:21 -
@@ -57,7 +57,7 @@ vasprintf(char **str, const char *fmt, _
return (ret);
 
 err:
-   free(f._bf._base);
+   freezero(f._bf._base, f._bf._size);
f._bf._base = NULL;
*str = NULL;
errno = ENOM

Re: freezero(3) for stdio's internal buffers

2017-11-27 Thread kshe
On Tue, 28 Nov 2017 05:52:25 +, Theo de Raadt wrote:
> > In fact, can recallocarray be faster than plain free followed by calloc?
>
> Yes.
>
> I think you are missing some nuances.  These added functions have fast paths
> and slow paths.  freezero() isn't just a bzero, it also has munmap()
> sequences.  You are adding forced bzero or munmap() in circumstances
> where previously a malloc implementation could keep the freed'd memory in
> a cache for reuse.

Hence programs willing to avoid leaking secrets to such cache cannot
safely use stdio functions to manipulate those secrets.  But, sure, this
may very well be acceptable as, after all, stdio is merely a library
provided for convenience, and programs needing complete control over
their memory could use their own wrappers around raw system calls
instead.

> I find it hard to read your diff as convert other than "convert all free
> operations to freezero", or always pay the cost no matter what.

I indeed took a systematic approach, so perhaps the bits in vfprintf.c
and vfwprintf.c are in fact overkill, and they could be left out.
However, this does not invalidate the other parts of my first patch,
many of which simply mirror the already established use of recallocarray
on the same buffers.

Regards,

kshe



Re: freezero(3) for stdio's internal buffers

2017-11-27 Thread Theo de Raadt
> In fact, can recallocarray be faster than plain free followed by calloc?

Yes.

I think you are missing some nuances.  These added functions have fast paths
and slow paths.  freezero() isn't just a bzero, it also has munmap()
sequences.  You are adding forced bzero or munmap() in circumstances
where previously a malloc implementation could keep the freed'd memory in
a cache for reuse.

I find it hard to read your diff as convert other than "convert all free
operations to freezero", or always pay the cost no matter what.

You could compare make build times.  I suspect this will slightly
exceed the noise floor.



Re: freezero(3) for stdio's internal buffers

2017-11-27 Thread kshe
On Mon, 27 Nov 2017 08:01:25 +, Otto Moerbeek wrote:
> On Mon, Nov 27, 2017 at 05:48:14AM +, kshe wrote:
> > On Mon, 27 Nov 2017 00:42:01 +, Theo de Raadt wrote:
> > > This needs worst-case performance measurements.
> >
> > The only instances where performance could be a problem are in
> > vfprintf.c and vfwprintf.c, where the calls happen inside a loop; but
> > for these, in fact, the best solution would be to use recallocarray
> > directly: rather than repeatedly freeing (and clearing) `convbuf' and
> > reallocating a fresh one every time it is needed, it should be kept
> > around and passed to __wcsconv() and __mbsconv(), along with some
> > accounting of its current size, so that these functions could then call
> > recallocarray appropriately.  However, this optimisation would be more
> > intrusive than the diff I submitted, and would therefore demand greater
> > familiarity with stdio's source, which I have not yet acquired.
> >
> > That being said, in all other cases, since the way stdio functions fill
> > their buffers is much more costly than simply writing a bunch of zeros
> > in sequence (or merely calling munmap(2)), no real slowdown is to be
> > expected.  I should also point out that recallocarray is currently used
> > inside several loops in fvwrite.c, fgetln.c and getdelim.c, but no one
> > complained that the affected functions became too slow, because doing
> > things, as stdio does, like reading from and writing to disk or decoding
> > and converting user input will always dominate the effect of a few calls
> > to discard temporary buffers.
> >
> > Regards,
> >
> > kshe
>
> I was thinking: only point against your statement is that there could
> be cases where a very large buffer is used, but only a small part of
> it is used. In that case, clearing the buffer is more costly. Luckily,
> in those cases malloc just calls munmap(2) without clearing.
>
> Still, I would like to see benchmarks to validate assumptions.

Well, I looked under regress/lib/libc/, but nothing here seems to test
for pathological instances where performance is known to be an issue.  I
would happily run benchmarks if someone could provide examples of such
cases.

In the mean time, here is a better diff for vfprintf.c and vfwprintf.c,
which should eliminate all potential performance regressions,
implementing the optimisation described in my previous message.  In
fact, can recallocarray be faster than plain free followed by calloc?
If so, in the case of vfwprintf.c, this patch may actually improve
performance.  At least, it shall not degrade it.

Nevertheless, this needs to be reviewed carefully since, as stated
above, I do not have much familiarity with stdio's intricate code.

Index: vfprintf.c
===
RCS file: /cvs/src/lib/libc/stdio/vfprintf.c,v
retrieving revision 1.77
diff -u -p -r1.77 vfprintf.c
--- vfprintf.c  29 Aug 2016 12:20:57 -  1.77
+++ vfprintf.c  28 Nov 2017 04:06:43 -
@@ -153,7 +153,7 @@ __sbprintf(FILE *fp, const char *fmt, va
  * string is null-terminated.
  */
 static char *
-__wcsconv(wchar_t *wcsarg, int prec)
+__wcsconv(wchar_t *wcsarg, int prec, char *oconvbuf, size_t *convbuf_size)
 {
mbstate_t mbs;
char buf[MB_LEN_MAX];
@@ -191,18 +191,19 @@ __wcsconv(wchar_t *wcsarg, int prec)
return (NULL);
}
}
-   if ((convbuf = malloc(nbytes + 1)) == NULL)
+   convbuf = recallocarray(oconvbuf, *convbuf_size, nbytes + 1, 1);
+   if (convbuf == NULL)
return (NULL);
+   *convbuf_size = nbytes + 1;
 
/* Fill the output buffer. */
p = wcsarg;
memset(&mbs, 0, sizeof(mbs));
-   if ((nbytes = wcsrtombs(convbuf, (const wchar_t **)&p,
-   nbytes, &mbs)) == (size_t)-1) {
-   free(convbuf);
+   nbytes = wcsrtombs(convbuf, (const wchar_t **)&p, nbytes, &mbs);
+   if (nbytes == (size_t)-1) {
+   freezero(convbuf, *convbuf_size);
return (NULL);
}
-   convbuf[nbytes] = '\0';
return (convbuf);
 }
 #endif
@@ -330,6 +331,7 @@ __vfprintf(FILE *fp, const char *fmt0, _
va_list orgap;  /* original argument pointer */
 #ifdef PRINTF_WIDE_CHAR
char *convbuf;  /* buffer for wide to multi-byte conversion */
+   size_t convbuf_size;
 #endif
 
/*
@@ -857,8 +859,6 @@ fp_common:
if (flags & LONGINT) {
wchar_t *wcp;
 
-   free(convbuf);
-   convbuf = NULL;
if ((wcp = GETARG(wchar_t *)) == NULL) {
struct syslog_data sdata = 
SYSLOG_DATA_INIT;
int save_errno = errno;
@@ -867,9 +867,13 @@ fp_common:
"vfprintf %%ls NULL in \"%s\"", 
fmt0);

Re: freezero(3) for stdio's internal buffers

2017-11-27 Thread Otto Moerbeek
On Mon, Nov 27, 2017 at 05:48:14AM +, kshe wrote:

> On Mon, 27 Nov 2017 00:42:01 +, Theo de Raadt wrote:
> > This needs worst-case performance measurements.
> 
> The only instances where performance could be a problem are in
> vfprintf.c and vfwprintf.c, where the calls happen inside a loop; but
> for these, in fact, the best solution would be to use recallocarray
> directly: rather than repeatedly freeing (and clearing) `convbuf' and
> reallocating a fresh one every time it is needed, it should be kept
> around and passed to __wcsconv() and __mbsconv(), along with some
> accounting of its current size, so that these functions could then call
> recallocarray appropriately.  However, this optimisation would be more
> intrusive than the diff I submitted, and would therefore demand greater
> familiarity with stdio's source, which I have not yet acquired.
> 
> That being said, in all other cases, since the way stdio functions fill
> their buffers is much more costly than simply writing a bunch of zeros
> in sequence (or merely calling munmap(2)), no real slowdown is to be
> expected.  I should also point out that recallocarray is currently used
> inside several loops in fvwrite.c, fgetln.c and getdelim.c, but no one
> complained that the affected functions became too slow, because doing
> things, as stdio does, like reading from and writing to disk or decoding
> and converting user input will always dominate the effect of a few calls
> to discard temporary buffers.
> 
> Regards,
> 
> kshe

I was thinking: only point against your statement is that there could
be cases where a very large buffer is used, but only a small part of
it is used. In that case, clearing the buffer is more costly. Luckily,
in those cases malloc just calls munmap(2) without clearing. 

Still, I would like to see benchmarks to validate assumptions.

-Otto



Re: freezero(3) for stdio's internal buffers

2017-11-26 Thread kshe
On Mon, 27 Nov 2017 00:42:01 +, Theo de Raadt wrote:
> This needs worst-case performance measurements.

The only instances where performance could be a problem are in
vfprintf.c and vfwprintf.c, where the calls happen inside a loop; but
for these, in fact, the best solution would be to use recallocarray
directly: rather than repeatedly freeing (and clearing) `convbuf' and
reallocating a fresh one every time it is needed, it should be kept
around and passed to __wcsconv() and __mbsconv(), along with some
accounting of its current size, so that these functions could then call
recallocarray appropriately.  However, this optimisation would be more
intrusive than the diff I submitted, and would therefore demand greater
familiarity with stdio's source, which I have not yet acquired.

That being said, in all other cases, since the way stdio functions fill
their buffers is much more costly than simply writing a bunch of zeros
in sequence (or merely calling munmap(2)), no real slowdown is to be
expected.  I should also point out that recallocarray is currently used
inside several loops in fvwrite.c, fgetln.c and getdelim.c, but no one
complained that the affected functions became too slow, because doing
things, as stdio does, like reading from and writing to disk or decoding
and converting user input will always dominate the effect of a few calls
to discard temporary buffers.

Regards,

kshe



Re: freezero(3) for stdio's internal buffers

2017-11-26 Thread Theo de Raadt
This needs worst-case performance measurements.



Re: freezero(3) for stdio's internal buffers

2017-11-26 Thread kshe
On Sun, 26 Nov 2017 19:56:03 +, kshe wrote:
> Hi,
>
> Shortly after recallocarray(3) was introduced, it was put into use for
> several objects internal to the stdio library "to avoid leaving detritus
> in memory when resizing buffers".  However, in the end, this memory is
> still released by plain free(3) calls.
>
> The same reason that motivated the change to recallocarray(3) should
> also entail that to freezero(3), where applicable.  Currently, a
> suitable overflow from a malloc(3)ed buffer could allow an attacker to
> retrieve lines previously read by fgetln(3), as well as data previously
> written using the fprintf(3) family of functions, even long after the
> corresponding streams have been closed and even if the programmer was
> very careful explicitly to discard all the manually allocated objects
> that could have contained sensitive data.  The diff below makes such
> attacks much less likely to succeed, so that one can read and write
> secrets to files with stdio functions more safely, id est without the
> undesirable side effect of leaving parts of those secrets in heap memory
> afterwards.

The diff previously attached was one of my early drafts where I only
marked the relevant calls.  Please ignore it.  Here is the real patch I
intended to send.

Index: asprintf.c
===
RCS file: /cvs/src/lib/libc/stdio/asprintf.c,v
retrieving revision 1.25
diff -u -p -r1.25 asprintf.c
--- asprintf.c  17 Mar 2017 14:53:08 -  1.25
+++ asprintf.c  26 Oct 2017 23:30:57 -
@@ -61,7 +61,7 @@ asprintf(char **str, const char *fmt, ..
return (ret);
 
 err:
-   free(f._bf._base);
+   freezero(f._bf._base, f._bf._size);
f._bf._base = NULL;
*str = NULL;
errno = ENOMEM;
Index: fclose.c
===
RCS file: /cvs/src/lib/libc/stdio/fclose.c,v
retrieving revision 1.10
diff -u -p -r1.10 fclose.c
--- fclose.c31 Aug 2015 02:53:57 -  1.10
+++ fclose.c26 Oct 2017 23:30:57 -
@@ -51,7 +51,7 @@ fclose(FILE *fp)
if (fp->_close != NULL && (*fp->_close)(fp->_cookie) < 0)
r = EOF;
if (fp->_flags & __SMBF)
-   free((char *)fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
if (HASUB(fp))
FREEUB(fp);
if (HASLB(fp))
Index: fmemopen.c
===
RCS file: /cvs/src/lib/libc/stdio/fmemopen.c,v
retrieving revision 1.3
diff -u -p -r1.3 fmemopen.c
--- fmemopen.c  31 Aug 2015 02:53:57 -  1.3
+++ fmemopen.c  26 Oct 2017 23:30:57 -
@@ -107,7 +107,7 @@ fmemopen_close_free(void *v)
 {
struct state*st = v;
 
-   free(st->string);
+   freezero(st->string, st->size);
free(st);
 
return (0);
Index: freopen.c
===
RCS file: /cvs/src/lib/libc/stdio/freopen.c,v
retrieving revision 1.16
diff -u -p -r1.16 freopen.c
--- freopen.c   21 Sep 2016 04:38:56 -  1.16
+++ freopen.c   26 Oct 2017 23:30:57 -
@@ -106,7 +106,7 @@ freopen(const char *file, const char *mo
if (isopen && f != wantfd)
(void) (*fp->_close)(fp->_cookie);
if (fp->_flags & __SMBF)
-   free((char *)fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
fp->_w = 0;
fp->_r = 0;
fp->_p = NULL;
Index: local.h
===
RCS file: /cvs/src/lib/libc/stdio/local.h,v
retrieving revision 1.25
diff -u -p -r1.25 local.h
--- local.h 23 May 2016 00:21:48 -  1.25
+++ local.h 26 Oct 2017 23:30:57 -
@@ -82,7 +82,7 @@ __END_HIDDEN_DECLS
 #defineHASUB(fp) (_UB(fp)._base != NULL)
 #defineFREEUB(fp) { \
if (_UB(fp)._base != (fp)->_ubuf) \
-   free(_UB(fp)._base); \
+   freezero(_UB(fp)._base, _UB(fp)._size); \
_UB(fp)._base = NULL; \
 }
 
@@ -91,7 +91,7 @@ __END_HIDDEN_DECLS
  */
 #defineHASLB(fp) ((fp)->_lb._base != NULL)
 #defineFREELB(fp) { \
-   free((char *)(fp)->_lb._base); \
+   freezero((fp)->_lb._base, (fp)->_lb._size); \
(fp)->_lb._base = NULL; \
 }
 
Index: setvbuf.c
===
RCS file: /cvs/src/lib/libc/stdio/setvbuf.c,v
retrieving revision 1.14
diff -u -p -r1.14 setvbuf.c
--- setvbuf.c   21 Sep 2016 04:38:56 -  1.14
+++ setvbuf.c   26 Oct 2017 23:30:57 -
@@ -70,7 +70,7 @@ setvbuf(FILE *fp, char *buf, int mode, s
fp->_r = fp->_lbfsize = 0;
flags = fp->_flags;
if (flags & __SMBF)
-   free(fp->_bf._base);
+   freezero(fp->_bf._base, fp->_bf._size);
flags &= ~(__SLBF | __SNBF | __SMBF | __SOPT | __SNPT | __SEOF);
 
/* If setting unbuffered mode, skip all the hard work. *