Re: uvm: free sizes diff

2017-01-31 Thread Stefan Kempf
David Hill wrote:
> Hello -
> 
> The following diff adds free sizes to free() calls in uvm/.  Only one
> remaining in uvm/.
 
ok stefan@

> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.78
> diff -u -p -r1.78 uvm_amap.c
> --- uvm/uvm_amap.c8 Oct 2016 16:19:44 -   1.78
> +++ uvm/uvm_amap.c30 Jan 2017 21:17:22 -
> @@ -368,7 +368,7 @@ amap_alloc1(int slots, int waitf, int la
>   return(amap);
>  
>  fail1:
> - free(amap->am_buckets, M_UVMAMAP, 0);
> + free(amap->am_buckets, M_UVMAMAP, buckets * sizeof(*amap->am_buckets));
> 	TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
>   pool_put(_amap_chunk_pool, chunk);
>   pool_put(_amap_pool, amap);
> @@ -414,7 +414,7 @@ amap_free(struct vm_amap *amap)
>  
>  #ifdef UVM_AMAP_PPREF
>   if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
> - free(amap->am_ppref, M_UVMAMAP, 0);
> + free(amap->am_ppref, M_UVMAMAP, amap->am_nslot * sizeof(int));
>  #endif
>  
>   if (UVM_AMAP_SMALL(amap))
> Index: uvm/uvm_aobj.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> retrieving revision 1.84
> diff -u -p -r1.84 uvm_aobj.c
> --- uvm/uvm_aobj.c24 Sep 2016 18:40:29 -  1.84
> +++ uvm/uvm_aobj.c30 Jan 2017 21:17:22 -
> @@ -403,7 +403,7 @@ uao_free(struct uvm_aobj *aobj)
>   uvmexp.swpgonly--;
>   }
>   }
> - free(aobj->u_swslots, M_UVMAOBJ, 0);
> + free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
>   }
>  
>   /* finally free the aobj itself */
> @@ -532,7 +532,7 @@ uao_shrink_array(struct uvm_object *uobj
>   for (i = 0; i < pages; i++)
>   new_swslots[i] = aobj->u_swslots[i];
>  
> - free(aobj->u_swslots, M_UVMAOBJ, 0);
> + free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
>  
>   aobj->u_swslots = new_swslots;
>   aobj->u_pages = pages;
> @@ -585,7 +585,7 @@ uao_grow_array(struct uvm_object *uobj, 
>   for (i = 0; i < aobj->u_pages; i++)
>   new_swslots[i] = aobj->u_swslots[i];
>  
> - free(aobj->u_swslots, M_UVMAOBJ, 0);
> + free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
>  
>   aobj->u_swslots = new_swslots;
>   aobj->u_pages = pages;
> @@ -664,7 +664,7 @@ uao_grow_convert(struct uvm_object *uobj
>   }
>   }
>  
> - free(old_swslots, M_UVMAOBJ, 0);
> + free(old_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
>   aobj->u_pages = pages;
>  
>   return 0;
> 



Re: clang and -Werror vs -Wpointer-sign

2017-01-30 Thread Stefan Kempf
Jonathan Gray wrote:
> Base gcc4 changes the defaults to set -Wno-pointer-sign.
> Base clang does not, I'm not sure where in the llvm code to do so.
> Base gcc3 does not handle -Wno-pointer-sign.
 
I think this should turn -Wpointer-sign off by default.
Passing -Wpointer-sign on the command line enables it.
Not suitable for upstreaming though.

Index: gnu/llvm/tools/clang//include/clang/Basic/DiagnosticSemaKinds.td
===
RCS file: 
/cvs/src/gnu/llvm/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 DiagnosticSemaKinds.td
--- gnu/llvm/tools/clang//include/clang/Basic/DiagnosticSemaKinds.td14 Jan 
2017 19:55:48 -  1.1.1.2
+++ gnu/llvm/tools/clang//include/clang/Basic/DiagnosticSemaKinds.td30 Jan 
2017 18:22:22 -
@@ -6332,7 +6332,7 @@ def ext_typecheck_convert_incompatible_p
   "sending to parameter of different type}0,1"
   "|%diff{casting $ to type $|casting between types}0,1}2"
   " converts between pointers to integer types with different sign">,
-  InGroup<DiagGroup<"pointer-sign">>;
+  InGroup<DiagGroup<"pointer-sign">>, DefaultIgnore;
 def ext_typecheck_convert_incompatible_pointer : ExtWarn<
   "incompatible pointer types "
   "%select{%diff{assigning to $ from $|assigning to different types}0,1"
 
> Below is a patch to add -Wno-pointer-sign to places that use
> -Werror and trigger -Wpointer-sign warnings which breaks the build
> when building with clang.  Based on an earlier patch from patrick@
> 
> Though really the default gcc4 and clang behaviour should be the
> same one way or the other.
>
> Index: lib/libcrypto/Makefile
> ===
> RCS file: /cvs/src/lib/libcrypto/Makefile,v
> retrieving revision 1.14
> diff -u -p -r1.14 Makefile
> --- lib/libcrypto/Makefile21 Jan 2017 09:38:58 -  1.14
> +++ lib/libcrypto/Makefile29 Jan 2017 05:10:50 -
> @@ -14,6 +14,9 @@ CLEANFILES=${PC_FILES} ${VERSION_SCRIPT}
>  LCRYPTO_SRC= ${.CURDIR}
>  
>  CFLAGS+= -Wall -Wundef -Werror
> +.if ${COMPILER_VERSION:L} != "gcc3"
> +CFLAGS+= -Wno-pointer-sign
> +.endif
>  
>  .if !defined(NOPIC)
>  CFLAGS+= -DDSO_DLFCN -DHAVE_DLFCN_H -DHAVE_FUNOPEN
> Index: lib/librthread/Makefile
> ===
> RCS file: /cvs/src/lib/librthread/Makefile,v
> retrieving revision 1.43
> diff -u -p -r1.43 Makefile
> --- lib/librthread/Makefile   1 Jun 2016 04:34:18 -   1.43
> +++ lib/librthread/Makefile   29 Jan 2017 05:27:29 -
> @@ -1,11 +1,16 @@
>  #$OpenBSD: Makefile,v 1.43 2016/06/01 04:34:18 tedu Exp $
>  
> +.include 
> +
>  LIB=pthread
>  LIBCSRCDIR=  ${.CURDIR}/../libc
>  
>  CFLAGS+=-Wall -g -Werror -Wshadow
>  CFLAGS+=-Werror-implicit-function-declaration
>  CFLAGS+=-Wsign-compare
> +.if ${COMPILER_VERSION:L} != "gcc3"
> +CFLAGS+= -Wno-pointer-sign
> +.endif
>  CFLAGS+=-I${.CURDIR} -include namespace.h \
>   -I${LIBCSRCDIR}/arch/${MACHINE_CPU} -I${LIBCSRCDIR}/include
>  CDIAGFLAGS=
> Index: lib/libtls/Makefile
> ===
> RCS file: /cvs/src/lib/libtls/Makefile,v
> retrieving revision 1.30
> diff -u -p -r1.30 Makefile
> --- lib/libtls/Makefile   25 Jan 2017 23:53:18 -  1.30
> +++ lib/libtls/Makefile   29 Jan 2017 05:32:43 -
> @@ -6,6 +6,9 @@ SUBDIR=   man
>  .endif
>  
>  CFLAGS+= -Wall -Werror -Wimplicit
> +.if ${COMPILER_VERSION:L} != "gcc3"
> +CFLAGS+= -Wno-pointer-sign
> +.endif
>  CFLAGS+= -DLIBRESSL_INTERNAL
>  
>  CLEANFILES= ${VERSION_SCRIPT}
> Index: usr.sbin/ocspcheck/Makefile
> ===
> RCS file: /cvs/src/usr.sbin/ocspcheck/Makefile,v
> retrieving revision 1.2
> diff -u -p -r1.2 Makefile
> --- usr.sbin/ocspcheck/Makefile   24 Jan 2017 09:25:27 -  1.2
> +++ usr.sbin/ocspcheck/Makefile   29 Jan 2017 05:27:10 -
> @@ -1,5 +1,7 @@
>  #$OpenBSD: Makefile,v 1.2 2017/01/24 09:25:27 deraadt Exp $
>  
> +.include 
> +
>  PROG=ocspcheck
>  MAN= ocspcheck.8
>  
> @@ -15,6 +17,9 @@ CFLAGS+= -Wshadow
>  CFLAGS+= -Wtrigraphs
>  CFLAGS+= -Wuninitialized
>  CFLAGS+= -Wunused
> +.if ${COMPILER_VERSION:L} != "gcc3"
> +CFLAGS+= -Wno-pointer-sign
> +.endif
>  
>  CFLAGS+= -DLIBRESSL_INTERNAL
>  
> 



Remove uvm hint address selector

2017-01-15 Thread Stefan Kempf
When uvm pivots are enabled, allocations
with an address hint provided by the user program are
handled by the uaddr_hint selector.

I'd like to remove the uaddr_hint selector. The uaddr_rnd selector
already deals with hinted allocations correctly, so why not use
it for hinted allocations in the pivot selector?

This diff is a part of making pivots work.
Getting rid of the hint selectors makes the code simpler.
I'd like to commit this part first, to make the actual pivot
diffs smaller.

ok?

Some more details:

If you have multiple selectors per address space that manage
overlapping address ranges, some selectors have priority
over others. E.g. only the brk selector is allowed to make
allocations within the brk area. The rnd selector
gets all these checks right, but the hint selector does
not, which causes panics when you use pivots and try to
do a hinted allocation within the brk area.

So better use the code which is known to work for hinted
allocations. Once pivots are enabled, maybe the parts of the
rnd allocator that deal with hinted allocations can be factored
out into a common function.

Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.22
diff -u -p -r1.22 uvm_addr.c
--- uvm/uvm_addr.c  16 Sep 2016 02:50:54 -  1.22
+++ uvm/uvm_addr.c  14 Jan 2017 05:53:17 -
@@ -46,17 +46,10 @@
 
 /* Pool with uvm_addr_state structures. */
 struct pool uaddr_pool;
-struct pool uaddr_hint_pool;
 struct pool uaddr_bestfit_pool;
 struct pool uaddr_pivot_pool;
 struct pool uaddr_rnd_pool;
 
-/* uvm_addr state for hint based selector. */
-struct uaddr_hint_state {
-   struct uvm_addr_stateuaddr;
-   vsize_t  max_dist;
-};
-
 /* uvm_addr state for bestfit selector. */
 struct uaddr_bestfit_state {
struct uvm_addr_stateubf_uaddr;
@@ -118,7 +111,6 @@ void uaddr_kremove(struct vm_map *,
 voiduaddr_kbootstrapdestroy(struct uvm_addr_state *);
 
 voiduaddr_destroy(struct uvm_addr_state *);
-voiduaddr_hint_destroy(struct uvm_addr_state *);
 voiduaddr_kbootstrap_destroy(struct uvm_addr_state *);
 voiduaddr_rnd_destroy(struct uvm_addr_state *);
 voiduaddr_bestfit_destroy(struct uvm_addr_state *);
@@ -138,10 +130,6 @@ int uaddr_rnd_select(struct vm_map 
*,
struct uvm_addr_state *, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
vaddr_t);
-int uaddr_hint_select(struct vm_map *,
-   struct uvm_addr_state*, struct vm_map_entry **,
-   vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
-   vaddr_t);
 int uaddr_bestfit_select(struct vm_map *,
struct uvm_addr_state*, struct vm_map_entry **,
vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
@@ -290,8 +278,6 @@ uvm_addr_init(void)
 {
pool_init(_pool, sizeof(struct uvm_addr_state), 0,
IPL_VM, PR_WAITOK, "uaddr", NULL);
-   pool_init(_hint_pool, sizeof(struct uaddr_hint_state), 0,
-   IPL_VM, PR_WAITOK, "uaddrhint", NULL);
pool_init(_bestfit_pool, sizeof(struct uaddr_bestfit_state), 0,
IPL_VM, PR_WAITOK, "uaddrbest", NULL);
pool_init(_pivot_pool, sizeof(struct uaddr_pivot_state), 0,
@@ -740,116 +726,6 @@ uaddr_rnd_print(struct uvm_addr_state *u
 #endif
 
 /*
- * An allocator that selects an address within distance of the hint.
- *
- * If no hint is given, the allocator refuses to allocate.
- */
-const struct uvm_addr_functions uaddr_hint_functions = {
-   .uaddr_select = _hint_select,
-   .uaddr_destroy = _hint_destroy,
-   .uaddr_name = "uaddr_hint"
-};
-
-/*
- * Create uaddr_hint state.
- */
-struct uvm_addr_state *
-uaddr_hint_create(vaddr_t minaddr, vaddr_t maxaddr, vsize_t max_dist)
-{
-   struct uaddr_hint_state *ua_hint;
-
-   KASSERT(uaddr_hint_pool.pr_size == sizeof(*ua_hint));
-
-   ua_hint = pool_get(&uaddr_hint_pool, PR_WAITOK);
-   ua_hint->uaddr.uaddr_minaddr = minaddr;
-   ua_hint->uaddr.uaddr_maxaddr = maxaddr;
-   ua_hint->uaddr.uaddr_functions = &uaddr_hint_functions;
-   ua_hint->max_dist = max_dist;
-   return &ua_hint->uaddr;
-}
-
-/*
- * Destroy uaddr_hint state.
- */
-void
-uaddr_hint_destroy(struct uvm_addr_state *uaddr)
-{
-   pool_put(_hint_pool, uaddr);
-}
-
-/*
- * Hint selector.
- *
- * Attempts to find an address that is within max_dist of the hint.
- */
-int
-uaddr_hint_select(struct vm_map *map, struct uvm_addr_state *uaddr_param,
-struct vm_map_entry **entry_out, vaddr_t *addr_out,
-vsize_t sz, vaddr_t align, vaddr_t offset,
-vm_prot_t prot, 

Re: ed(1) doesn't support address ranges which begin with comma or semicolon

2017-01-03 Thread Stefan Kempf
Theo Buehler wrote:
> On Sat, Dec 24, 2016 at 10:45:16PM +0100, Theo Buehler wrote:
> > On Sat, Dec 24, 2016 at 05:44:07PM +0100, Jérôme FRGACIC wrote:
> > > Hi @tech,
> > > 
> > > I remark that ed(1) does not support address ranges which begin with
> > > comma or semicolon, for example ",10p" which is equivalent to "1,10p" or
> > > "; +10p" which is equivalent to ".;+10p". These address ranges are
> > > specified by Open Group Base Specifications Issue 6 (IEEE Std 1003.1,
> > > 2004 Edition) (in fact, there are also address ranges like "10," which
> > > is equivalent to "10,10" but they seem useless to me...).
> > > 
> > > I would suggest this diff to add support for these adress ranges.
> > 
> > ok tb@
> 
> anyone willing to commit this or give another ok?

Looks good to me. ok stefan@
 
> > 
> > > Index: main.c
> > > ===
> > > RCS file: /cvs/src/bin/ed/main.c,v
> > > retrieving revision 1.58
> > > diff -u -r1.58 main.c
> > > --- main.c16 Aug 2016 20:04:46 -  1.58
> > > +++ main.c24 Dec 2016 15:26:41 -
> > > @@ -383,7 +383,9 @@
> > >   ibufp++;
> > >   addr_cnt++;
> > >   second_addr = (c == ';') ?
> > > current_addr : 1;
> > > - addr = addr_last;
> > > + addr = next_addr();
> > > + if (addr < 0)
> > > + addr = addr_last;
> > >   break;
> > >   }
> > >   /* FALLTHROUGH */
> > > 
> > > PS : I haven't subcribe to the tech mailing list, so please add me as
> > > recipient if you reply.
> > > PPS : Merry Christmas.
> > > 
> > > 
> > > Kind regards,
> > > 
> > > 
> > > Jérôme FRGACIC
> > > 
> > 
> 



Re: H option for malloc

2016-10-28 Thread Stefan Kempf
Otto Moerbeek wrote:
> Hi,
> 
> Pages in the malloc cache are either reused quickly or unmapped
> quickly. In both cases it does not make sense to set hints on them.
> So remove that option, which is just a remainder of old times when
> malloc used to hold on to pages. OK?

Makes sense. ok stefan@
 
>   -Otto
> 
> Index: lib/libc/stdlib/malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.207
> diff -u -p -r1.207 malloc.c
> --- lib/libc/stdlib/malloc.c  22 Oct 2016 14:27:19 -  1.207
> +++ lib/libc/stdlib/malloc.c  28 Oct 2016 08:52:05 -
> @@ -176,7 +176,6 @@ struct malloc_readonly {
>   int malloc_mt;  /* multi-threaded mode? */
>   int malloc_freenow; /* Free quickly - disable chunk rnd */
>   int malloc_freeunmap;   /* mprotect free pages PROT_NONE? */
> - int malloc_hint;/* call madvice on free pages?  */
>   int malloc_junk;/* junk fill? */
>   int malloc_move;/* move allocations to end of page? */
>   int malloc_realloc; /* always realloc? */
> @@ -374,8 +373,6 @@ unmap(struct dir_info *d, void *p, size_
>   MALLOC_MAXCHUNK : sz;
>   memset(p, SOME_FREEJUNK, amt);
>   }
> - if (mopts.malloc_hint)
> - madvise(p, sz, MADV_FREE);
>   if (mopts.malloc_freeunmap)
>   mprotect(p, sz, PROT_NONE);
>   r->p = p;
> @@ -447,8 +444,6 @@ map(struct dir_info *d, void *hint, size
>   d->free_regions_size -= psz;
>   if (mopts.malloc_freeunmap)
>   mprotect(p, sz, PROT_READ | PROT_WRITE);
> - if (mopts.malloc_hint)
> - madvise(p, sz, MADV_NORMAL);
>   if (zero_fill)
>   memset(p, 0, sz);
>   else if (mopts.malloc_junk == 2 &&
> @@ -465,8 +460,6 @@ map(struct dir_info *d, void *hint, size
>   r->p = (char *)r->p + (psz << MALLOC_PAGESHIFT);
>   if (mopts.malloc_freeunmap)
>   mprotect(p, sz, PROT_READ | PROT_WRITE);
> - if (mopts.malloc_hint)
> - madvise(p, sz, MADV_NORMAL);
>   r->size -= psz;
>   d->free_regions_size -= psz;
>   if (zero_fill)
> @@ -531,12 +524,6 @@ omalloc_parseopt(char opt)
>   break;
>   case 'G':
>   mopts.malloc_guard = MALLOC_PAGESIZE;
> - break;
> - case 'h':
> - mopts.malloc_hint = 0;
> - break;
> - case 'H':
> - mopts.malloc_hint = 1;
>   break;
>   case 'j':
>   if (mopts.malloc_junk > 0)
> Index: share/man/man5/malloc.conf.5
> ===
> RCS file: /cvs/src/share/man/man5/malloc.conf.5,v
> retrieving revision 1.9
> diff -u -p -r1.9 malloc.conf.5
> --- share/man/man5/malloc.conf.5  17 Oct 2016 06:29:08 -  1.9
> +++ share/man/man5/malloc.conf.5  28 Oct 2016 08:52:09 -
> @@ -73,10 +73,6 @@ option for security).
>  Enable guard pages.
>  Each page size or larger allocation is followed by a guard page that will
>  cause a segmentation fault upon any access.
> -.It Cm H
> -.Dq Hint .
> -Pass a hint to the kernel about pages we don't use.
> -If the machine is paging a lot this may help a bit.
>  .It Cm J
>  .Dq More junking .
>  Increase the junk level by one if it is smaller than 2.
> 
> 



Re: C startup code: make crtbegin code safe for clang

2016-08-07 Thread Stefan Kempf
Philip Guenther wrote:
> On Mon, Aug 1, 2016 at 11:45 AM, Mark Kettenis <mark.kette...@xs4all.nl> 
> wrote:
> >> From: j...@wxcvbn.org (Jeremie Courreges-Anglas)
> >> Date: Mon, 01 Aug 2016 20:30:33 +0200
> >>
> >> Stefan Kempf <sisnk...@gmail.com> writes:
> >>
> >> > The constructor and destructor tables are declared as arrays with one
> >> > non-NULL element. Walking those until a NULL element is reached looks
> >> > like out-of-bound accesses to newer compilers, and they turn the code
> >> > into infinite loops (e.g. clang 3.8), because it is undefined behavior.
> 
> So it needs to reference the pointers via an extern incomplete array?
> Can we move to setting up the leading count via the linker script
> magic documented in the ld info page, and then just use __CTOR_LIST__
> as the extern array?
 
Here's an attempt at this. The startup code itself is unchanged except for
using extern instead static arrays now.

This needs building and installing binutils first. Afterwards lib/csu
can be recompiled.

Does that diff look better?

Index: gnu/usr.bin/binutils-2.17/ld/emulparams/elf_obsd.sh
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/ld/emulparams/elf_obsd.sh,v
retrieving revision 1.2
diff -u -p -r1.2 elf_obsd.sh
--- gnu/usr.bin/binutils-2.17/ld/emulparams/elf_obsd.sh 30 Apr 2015 17:56:18 
-  1.2
+++ gnu/usr.bin/binutils-2.17/ld/emulparams/elf_obsd.sh 7 Aug 2016 17:42:23 
-
@@ -7,4 +7,23 @@ PAD_GOT=
 PAD_PLT=
 DATA_START_SYMBOLS='__data_start = . ;'
 
+if [ "$ELFSIZE" = 64 ]
+then
+   CTOR_START="PROVIDE_HIDDEN(__CTOR_LIST__ = .); \
+   QUAD((__CTOR_END__ - __CTOR_LIST__) / 8 - 2);"
+   CTOR_END="QUAD(0); PROVIDE_HIDDEN(__CTOR_END__ = .);"
+
+   DTOR_START="PROVIDE_HIDDEN(__DTOR_LIST__ = .); \
+   QUAD((__DTOR_END__ - __DTOR_LIST__) / 8 - 2);"
+   DTOR_END="QUAD(0); PROVIDE_HIDDEN(__DTOR_END__ = .);"
+else
+   CTOR_START="PROVIDE_HIDDEN(__CTOR_LIST__ = .); \
+   LONG((__CTOR_END__ - __CTOR_LIST__) / 4 - 2);"
+   CTOR_END="LONG(0); PROVIDE_HIDDEN(__CTOR_END__ = .);"
+
+   DTOR_START="PROVIDE_HIDDEN(__DTOR_LIST__ = .); \
+   LONG((__DTOR_END__ - __DTOR_LIST__) / 4 - 2);"
+   DTOR_END="LONG(0); PROVIDE_HIDDEN(__DTOR_END__ = .);"
+fi
+
 unset SEPARATE_GOTPLT
Index: gnu/usr.bin/binutils-2.17/ld/scripttempl/elf.sc
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/ld/scripttempl/elf.sc,v
retrieving revision 1.8
diff -u -p -r1.8 elf.sc
--- gnu/usr.bin/binutils-2.17/ld/scripttempl/elf.sc 9 Aug 2014 04:49:47 
-   1.8
+++ gnu/usr.bin/binutils-2.17/ld/scripttempl/elf.sc 7 Aug 2016 17:42:23 
-
@@ -203,7 +203,8 @@ fi
 CTOR=".ctors${CONSTRUCTING-0} : 
   {
 ${CONSTRUCTING+${CTOR_START}}
-/* gcc uses crtbegin.o to find the start of
+/* on some systems,
+   gcc uses crtbegin.o to find the start of
the constructors, so we make sure it is
first.  Because this is a wildcard, it
doesn't matter if the user does not
@@ -218,7 +219,8 @@ CTOR=".ctors${CONSTRUCTING-0} : 
 /* We don't want to include the .ctor section from
the crtend.o file until after the sorted ctors.
The .ctor section from the crtend file contains the
-   end of ctors marker and it must be last */
+   end of ctors marker and it must be last
+   on some systems */
 
 KEEP (*(EXCLUDE_FILE (*crtend*.o $OTHER_EXCLUDE_FILES) .ctors))
 KEEP (*(SORT(.ctors.*)))
@@ -371,7 +373,11 @@ cat < >> > While there, clean up crtbegin.c and crtbegin.S a little and make them
> >> > more similar.
> 
> I'm fine with this being done...once we sure we actually have nailed
> down what are the actual changes necessary to get the code to compile
> correctly without changing behavior.
> 
> ...
> >> The existing code tries to retrieve the number of valid ctors entries
> >> from __CTOR_LIST__[0], only when that number is -1 it tries to find
> >> the actual value by walking the array.
> 
> Since we only have one version of ld doing linking now, can't we
> switch from the -1 to the real count version?
> 
> ...
> > Also, aren't ctor_list and dtor_list polluting the namespace?
> 
> Yep.
> 
> 
> Philip



Re: C startup code: make crtbegin code safe for clang

2016-08-03 Thread Stefan Kempf
Philip Guenther wrote:
> On Mon, Aug 1, 2016 at 11:45 AM, Mark Kettenis <mark.kette...@xs4all.nl> 
> wrote:
> >> From: j...@wxcvbn.org (Jeremie Courreges-Anglas)
> >> Date: Mon, 01 Aug 2016 20:30:33 +0200
> >>
> >> Stefan Kempf <sisnk...@gmail.com> writes:
> >>
> >> > The constructor and destructor tables are declared as arrays with one
> >> > non-NULL element. Walking those until a NULL element is reached looks
> >> > like out-of-bound accesses to newer compilers, and they turn the code
> >> > into infinite loops (e.g. clang 3.8), because it is undefined behavior.
> 
> So it needs to reference the pointers via an extern incomplete array?
> Can we move to setting up the leading count via the linker script
> magic documented in the ld info page, and then just use __CTOR_LIST__
> as the extern array?
 
Right, we need the extern trick.
I'll take a look into doing it with counts and __CTOR_LIST__ and
__CTOR_END__. We want __CTOR_LIST__ and __CTOR_END__ to be .hidden
symbols, right?
 
> >> > While there, clean up crtbegin.c and crtbegin.S a little and make them
> >> > more similar.
> 
> I'm fine with this being done...once we sure we actually have nailed
> down what are the actual changes necessary to get the code to compile
> correctly without changing behavior.
> 
> ...
> >> The existing code tries to retrieve the number of valid ctors entries
> >> from __CTOR_LIST__[0], only when that number is -1 it tries to find
> >> the actual value by walking the array.
> 
> Since we only have one version of ld doing linking now, can't we
> switch from the -1 to the real count version?
> 
> ...
> > Also, aren't ctor_list and dtor_list polluting the namespace?
> 
> Yep.
 
> Philip



C startup code: make crtbegin code safe for clang

2016-08-01 Thread Stefan Kempf
The constructor and destructor tables are declared as arrays with one
non-NULL element. Walking those until a NULL element is reached looks
like out-of-bound accesses to newer compilers, and they turn the code
into infinite loops (e.g. clang 3.8), because it is undefined behavior.

Use constructor/destructor calling code that should be legal in both the
gcc and clang C dialect, to hopefully be immune from undefined behavior
optimizations in the future.

While there, clean up crtbegin.c and crtbegin.S a little and make them
more similar.

ok?

Index: lib/csu/crtbegin.c
===
RCS file: /cvs/src/lib/csu/crtbegin.c,v
retrieving revision 1.20
diff -u -p -r1.20 crtbegin.c
--- lib/csu/crtbegin.c  10 Nov 2015 04:14:03 -  1.20
+++ lib/csu/crtbegin.c  1 Aug 2016 16:56:31 -
@@ -85,36 +85,37 @@ long __guard_local __dso_hidden __attrib
 
 
 static const init_f __CTOR_LIST__[1]
-__attribute__((section(".ctors"))) = { (void *)-1 };   /* XXX */
+__used __attribute__((section(".ctors"))) = { (void *)-1 };/* XXX 
*/
 static const init_f __DTOR_LIST__[1]
-__attribute__((section(".dtors"))) = { (void *)-1 };   /* XXX */
+__used __attribute__((section(".dtors"))) = { (void *)-1 };/* XXX 
*/
+
+extern const init_f ctor_list[] asm(".ctors");
+extern const init_f dtor_list[] asm(".dtors");
 
 static void__dtors(void) __used;
 static void__ctors(void) __used;
 
 static void
-__ctors()
+__ctors(void)
 {
-   unsigned long i = (unsigned long) __CTOR_LIST__[0];
-   const init_f *p;
+   int i;
+
+   for (i = 0; ctor_list[i + 1] != NULL; i++)
+   continue;
 
-   if (i == -1)  {
-   for (i = 1; __CTOR_LIST__[i] != NULL; i++)
-   ;
+   while (i > 0) {
+   ctor_list[i]();
i--;
}
-   p = __CTOR_LIST__ + i;
-   while (i--)
-   (**p--)();
 }
 
 static void
-__dtors()
+__dtors(void)
 {
-   const init_f *p = __DTOR_LIST__ + 1;
+   int i;
 
-   while (*p)
-   (**p++)();
+   for (i = 1; dtor_list[i] != NULL; i++)
+   dtor_list[i]();
 }
 
 void __init(void);
@@ -130,8 +131,8 @@ MD_SECT_CALL_FUNC(".init", __do_init);
 MD_SECT_CALL_FUNC(".fini", __do_fini);
 
 
-void
-__do_init()
+static void
+__do_init(void)
 {
static int initialized = 0;
static struct dwarf2_eh_object object;
@@ -146,14 +147,14 @@ __do_init()
	__register_frame_info(__EH_FRAME_BEGIN__, &object);
if (__JCR_LIST__[0] && _Jv_RegisterClasses)
_Jv_RegisterClasses(__JCR_LIST__);
-   (__ctors)();
+   __ctors();
 
atexit(__fini);
}
 }
 
-void
-__do_fini()
+static void
+__do_fini(void)
 {
static int finalized = 0;
 
@@ -162,7 +163,7 @@ __do_fini()
/*
 * Call global destructors.
 */
-   (__dtors)();
+   __dtors();
}
 }
 
Index: lib/csu/crtbeginS.c
===
RCS file: /cvs/src/lib/csu/crtbeginS.c,v
retrieving revision 1.16
diff -u -p -r1.16 crtbeginS.c
--- lib/csu/crtbeginS.c 7 Apr 2015 01:27:06 -   1.16
+++ lib/csu/crtbeginS.c 1 Aug 2016 16:56:31 -
@@ -38,7 +38,6 @@
  * constructors and destructors. The first element contains the
  * number of pointers in each.
  * The tables are also null-terminated.
-
  */
 #include 
 
@@ -96,39 +95,39 @@ asm(".hidden pthread_atfork\n .weak pthr
 
 
 static init_f __CTOR_LIST__[1]
-__attribute__((section(".ctors"))) = { (void *)-1 };   /* XXX */
+__used __attribute__((section(".ctors"))) = { (void *)-1 };/* XXX 
*/
 static init_f __DTOR_LIST__[1]
-__attribute__((section(".dtors"))) = { (void *)-1 };   /* XXX */
+__used __attribute__((section(".dtors"))) = { (void *)-1 };/* XXX 
*/
+
+extern const init_f ctor_list[] asm(".ctors");
+extern const init_f dtor_list[] asm(".dtors");
 
 static void__dtors(void) __used;
 static void__ctors(void) __used;
 
-void
+static void
 __ctors(void)
 {
-   unsigned long i = (unsigned long) __CTOR_LIST__[0];
-   init_f *p;
+   int i;
 
-   if (i == -1)  {
-   for (i = 1; __CTOR_LIST__[i] != NULL; i++)
-   ;
+   for (i = 0; ctor_list[i + 1] != NULL; i++)
+   continue;
+
+   while (i > 0) {
+   ctor_list[i]();
i--;
}
-   p = __CTOR_LIST__ + i;
-   while (i--) {
-   (**p--)();
-   }
 }
 
 static void
 __dtors(void)
 {
-   init_f *p = __DTOR_LIST__ + 1;
+   int i;
 
-   while (*p) {
-   (**p++)();
-   }
+   for (i = 1; dtor_list[i] != NULL; i++)
+   dtor_list[i]();
 }
+
 void _init(void);
 void _fini(void);
 static void _do_init(void) __used;
@@ -141,7 +140,7 @@ 

Use of uninitialized variables in binutils-2.17

2016-07-31 Thread Stefan Kempf
clang errors out about use of undefined variables when building
binutils-2.17.

In elf.c, do not increment `s' before it is initialized. At the time
of the increment, `s' is otherwise unused anyway.

In elflink.c, initialize sec_contents and l_sec_contents to make
sure that the free(sec_contents) and free(l_sec_contents) are called
on valid pointers.

ok?

Index: gnu/usr.bin/binutils-2.17/bfd/elf.c
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/bfd/elf.c,v
retrieving revision 1.10
diff -u -p -r1.10 elf.c
--- gnu/usr.bin/binutils-2.17/bfd/elf.c 26 Jul 2016 02:38:12 -  1.10
+++ gnu/usr.bin/binutils-2.17/bfd/elf.c 31 Jul 2016 14:54:14 -
@@ -8617,7 +8617,7 @@ _bfd_elf_get_synthetic_symtab (bfd *abfd
   count = relplt->size / hdr->sh_entsize;
   size = count * sizeof (asymbol);
   p = relplt->relocation;
-  for (i = 0; i < count; i++, s++, p++)
+  for (i = 0; i < count; i++, p++)
 size += strlen ((*p->sym_ptr_ptr)->name) + sizeof ("@plt");
 
   s = *ret = bfd_malloc (size);
Index: gnu/usr.bin/binutils-2.17/bfd/elflink.c
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/bfd/elflink.c,v
retrieving revision 1.17
diff -u -p -r1.17 elflink.c
--- gnu/usr.bin/binutils-2.17/bfd/elflink.c 22 Jun 2016 13:29:14 -  
1.17
+++ gnu/usr.bin/binutils-2.17/bfd/elflink.c 31 Jul 2016 14:54:15 -
@@ -9862,7 +9862,7 @@ _bfd_elf_section_already_linked (bfd *ab
   abfd, sec);
  else if (sec->size != 0)
{
- bfd_byte *sec_contents, *l_sec_contents;
+ bfd_byte *sec_contents = NULL, *l_sec_contents = NULL;
 
  if (!bfd_malloc_and_get_section (abfd, sec, _contents))
(*_bfd_error_handler)



Re: unsigned comparison issue in sys_mmap()?

2016-06-01 Thread Stefan Kempf
Ingo Schwarze wrote:
> Hi,
> 
> when p->p_rlimit[RLIMIT_DATA].rlim_cur < ptoa(p->p_vmspace->vm_dused),
> the following comparison fails because the subtraction wraps around
> to huge positive values.
> 
> I noticed this because i tried to do systematic testing of specific
> malloc failure codepaths.  But whenever i used setrlimit(2) to set
> a low RLIMIT_DATA, a subsequent malloc(3) with a big argument always
> succeeded.
> 
> It may be unsusual for a program to lower its own RLIMIT_DATA, and
> i'm neither sure whether there are other use cases besides testing
> nor whether there are other ways to arrive in a situation with
> p->p_rlimit[RLIMIT_DATA].rlim_cur < ptoa(p->p_vmspace->vm_dused)
> besides late setrlimit(2) to a low value.
> 
> But even if there are no use cases besides testing, do we want
> to have an unsigned comparison in the kernel that can wrap around?
> 
> I can hardly believe i'm sending such a diff...  :-o
> 
> I'm running it right now.  So far, i noticed no regressions,
> and testing of malloc failure codepaths has become much easier.
> I consider that useful to make our software better.
> 
> What do you think?

Agreed that we need to check for wraparound here.

For some reason the 
p->p_rlimit[RLIMIT_DATA].rlim_cur - size < ptoa(p->p_vmspace->vm_dused)
confuses me, although the operands compared to the original check are
just re-arranged.

How about writing it like this?

ptoa(p->p_vmspace->vm_dused) > p->p_rlimit[RLIMIT_DATA].rlim_cur ||
size > (p->p_rlimit[RLIMIT_DATA].rlim_cur - ptoa(p->p_vmspace->vm_dused))
 
> Yours,
>   Ingo
> 
> 
> Index: uvm/uvm_mmap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.130
> diff -u -p -r1.130 uvm_mmap.c
> --- uvm/uvm_mmap.c1 Jun 2016 04:53:54 -   1.130
> +++ uvm/uvm_mmap.c2 Jun 2016 01:23:20 -
> @@ -526,8 +526,9 @@ sys_mmap(struct proc *p, void *v, regist
>   }
>   if ((flags & MAP_ANON) != 0 ||
>   ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
> - if (size >
> - (p->p_rlimit[RLIMIT_DATA].rlim_cur - 
> ptoa(p->p_vmspace->vm_dused))) {
> + if (p->p_rlimit[RLIMIT_DATA].rlim_cur < size ||
> + p->p_rlimit[RLIMIT_DATA].rlim_cur - size <
> + ptoa(p->p_vmspace->vm_dused)) {
>   error = ENOMEM;
>   goto out;
>   }
> @@ -545,8 +546,9 @@ is_anon:  /* label for SunOS style /dev/z
>  
>   if ((flags & MAP_ANON) != 0 ||
>   ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
> - if (size >
> - (p->p_rlimit[RLIMIT_DATA].rlim_cur - 
> ptoa(p->p_vmspace->vm_dused))) {
> + if (p->p_rlimit[RLIMIT_DATA].rlim_cur < size ||
> + p->p_rlimit[RLIMIT_DATA].rlim_cur - size <
> + ptoa(p->p_vmspace->vm_dused)) {
>   return ENOMEM;
>   }
>   }
> 



procmap: map vnodes to names again

2016-05-26 Thread Stefan Kempf
This re-introduces the vnode-to-filename mapping code with adaptions to
the new namecache layout.

For example, running procmap -a 1 now prints the .text, .data, and
.rodata segments of init(1) as below, if /sbin/init is in the
name cache:

104ccb70-104ccb73dfff 248k 0 r-xp+ (rwx) 1/0/0 04:00 42 - /sbin/init 
[0xff043c977898]
104ccb83d000-104ccb847fff  44k 3d000 r--p+ (rwx) 1/0/0 04:00 42 - /sbin/init 
[0xff043c977898]
104ccbb48000-104ccbb48fff   4k 48000 rw-p+ (rwx) 1/0/0 04:00 42 - /sbin/init 
[0xff043c977898]

ok?

Index: usr.sbin/procmap/procmap.1
===
RCS file: /cvs/src/usr.sbin/procmap/procmap.1,v
retrieving revision 1.20
diff -u -p -r1.20 procmap.1
--- usr.sbin/procmap/procmap.1  13 Mar 2015 19:58:41 -  1.20
+++ usr.sbin/procmap/procmap.1  26 May 2016 08:02:58 -
@@ -90,6 +90,8 @@ dump the process's vm_map structure
 dump the vm_map.header structure
 .It Cm 8
 dump each vm_map_entry in its entirety
+.It Cm 16
+dump the namei cache as it is traversed
 .El
 .It Fl d
 Dumps the vm_map and vm_map_entry structures in a style similar to
Index: usr.sbin/procmap/procmap.c
===
RCS file: /cvs/src/usr.sbin/procmap/procmap.c,v
retrieving revision 1.61
diff -u -p -r1.61 procmap.c
--- usr.sbin/procmap/procmap.c  25 May 2016 15:45:53 -  1.61
+++ usr.sbin/procmap/procmap.c  26 May 2016 08:02:58 -
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* XXX until uvm gets cleaned up */
@@ -79,6 +80,7 @@ typedef int boolean_t;
 #define PRINT_VM_MAP   0x0002
 #define PRINT_VM_MAP_HEADER0x0004
 #define PRINT_VM_MAP_ENTRY 0x0008
+#define DUMP_NAMEI_CACHE   0x0010
 
 struct cache_entry {
LIST_ENTRY(cache_entry) ce_next;
@@ -89,8 +91,10 @@ struct cache_entry {
 };
 
 LIST_HEAD(cache_head, cache_entry) lcache;
+TAILQ_HEAD(namecache_head, namecache) nclruhead;
+int namecache_loaded;
 void *uvm_vnodeops, *uvm_deviceops, *aobj_pager;
-u_long kernel_map_addr;
+u_long kernel_map_addr, nclruhead_addr;
 int debug, verbose;
 int print_all, print_map, print_maps, print_solaris, print_ddb, print_amap;
 int rwx = PROT_READ | PROT_WRITE | PROT_EXEC;
@@ -165,6 +169,8 @@ struct nlist nl[] = {
 #define NL_AOBJ_PAGER  3
{ "_kernel_map" },
 #define NL_KERNEL_MAP  4
+   { "_nclruhead" },
+#define NL_NCLRUHEAD   5
{ NULL }
 };
 
@@ -178,6 +184,8 @@ size_t dump_vm_map_entry(kvm_t *, struct
 char *findname(kvm_t *, struct kbit *, struct vm_map_entry *, struct kbit *,
 struct kbit *, struct kbit *);
 int search_cache(kvm_t *, struct kbit *, char **, char *, size_t);
+void load_name_cache(kvm_t *);
+void cache_enter(struct namecache *);
 static void __dead usage(void);
 static pid_t strtopid(const char *);
 void print_sum(struct sum *, struct sum *);
@@ -219,7 +227,7 @@ main(int argc, char *argv[])
print_ddb = 1;
break;
case 'D':
-   debug = strtonum(optarg, 0, 0xf, );
+   debug = strtonum(optarg, 0, 0x1f, );
if (errstr)
errx(1, "invalid debug mask");
break;
@@ -524,6 +532,8 @@ load_symbols(kvm_t *kd)
uvm_deviceops = (void*)nl[NL_UVM_DEVICEOPS].n_value;
aobj_pager =(void*)nl[NL_AOBJ_PAGER].n_value;
 
+   nclruhead_addr = nl[NL_NCLRUHEAD].n_value;
+
_KDEREF(kd, nl[NL_MAXSSIZ].n_value, ,
sizeof(maxssiz));
_KDEREF(kd, nl[NL_KERNEL_MAP].n_value, _map_addr,
@@ -889,6 +899,9 @@ search_cache(kvm_t *kd, struct kbit *vp,
char *o, *e;
u_long cid;
 
+   if (!namecache_loaded)
+   load_name_cache(kd);
+
P() = P(vp);
S() = sizeof(struct vnode);
cid = D(vp, vnode)->v_id;
@@ -900,8 +913,11 @@ search_cache(kvm_t *kd, struct kbit *vp,
if (ce->ce_vp == P() && ce->ce_cid == cid)
break;
if (ce && ce->ce_vp == P() && ce->ce_cid == cid) {
-   if (o != e)
+   if (o != e) {
+   if (o <= buf)
+   break;
*(--o) = '/';
+   }
if (o - ce->ce_nlen <= buf)
break;
o -= ce->ce_nlen;
@@ -919,6 +935,56 @@ search_cache(kvm_t *kd, struct kbit *vp,
 
KDEREF(kd, );
return (D(, vnode)->v_flag & VROOT);
+}
+
+void
+load_name_cache(kvm_t *kd)
+{
+   struct namecache n, *tmp;
+   struct namecache_head nchead;
+
+   LIST_INIT();
+   _KDEREF(kd, nclruhead_addr, , sizeof(nchead));
+   tmp = TAILQ_FIRST();
+   while (tmp != NULL) {
+   _KDEREF(kd, (u_long)tmp, , 

Re: ld patch that greatly speeds up linking large programs with debug symbols

2016-05-07 Thread Stefan Kempf
Aaron Miller wrote:
> Hi All,
> 
> I was experiencing ~8 minute linking times for a large C++ application
> I have been working on when running -current on amd64. It turns out
> that the decade-old version of ld in the OpenBSD source tree has a bug
> that causes quadratic complexity for some linking operations when
> debug symbols are present. I found a patch from 2006 in a mailing list
> archive that fixes this; I adapted it to work with OpenBSD by editing
> files that are normally regenerated (I couldn't figure out how to do
> automatically regenerate them).
> 
> Here is the original patch:
> https://sourceware.org/ml/binutils/2006-08/msg00334.html
> 
> I have rebuilt the kernel and system with this ld and everything runs
> identically to the system linked with the unpatched ld.
> 
> Please test this and let me know if this could get into the tree, and
> if not, what changes I need to make. Thanks!

Interesting!

About the autogenerated files: it looks you already made the additions
to bfd/linker.c, bfd/targets.c and bfd/libbfd.h that should end up
in bfd-in2.h and libbfd-in.h.

To regenerate them, make sure you first did a "make obj" in the root of
your source tree. Then run "make depend" from gnu/usr.bin. Afterwards,
go to gnu/usr.bin/binutils-2.17/obj/bfd and run "make headers".
That did the trick for me.

PS: It appears that gmail mangled your diff. It does not apply cleanly
here. You probably need to use a different mail client or try sending the
diff as a plain text attachment. Can you please resend an un-mangled
diff?
 
> Index: bfd/bfd-in2.h
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/bfd/bfd-in2.h,v
> retrieving revision 1.4
> diff -u -p -r1.4 bfd-in2.h
> --- bfd/bfd-in2.h25 May 2015 14:56:26 -1.4
> +++ bfd/bfd-in2.h17 Apr 2016 05:14:15 -
> @@ -5068,7 +5068,7 @@ typedef struct bfd_target
> 
>/* Check if SEC has been already linked during a reloceatable or
>   final link.  */
> -  void (*_section_already_linked) (bfd *, struct bfd_section *);
> +  void (*_section_already_linked) (bfd *, struct bfd_section *,
> struct bfd_link_info *);
> 
>/* Routines to handle dynamic symbols and relocs.  */
>  #define BFD_JUMP_TABLE_DYNAMIC(NAME) \
> @@ -5128,10 +5128,10 @@ bfd_boolean bfd_link_split_section (bfd
>  #define bfd_link_split_section(abfd, sec) \
> BFD_SEND (abfd, _bfd_link_split_section, (abfd, sec))
> 
> -void bfd_section_already_linked (bfd *abfd, asection *sec);
> +void bfd_section_already_linked (bfd *abfd, asection *sec, struct
> bfd_link_info *);
> 
> -#define bfd_section_already_linked(abfd, sec) \
> -   BFD_SEND (abfd, _section_already_linked, (abfd, sec))
> +#define bfd_section_already_linked(abfd, sec, info) \
> +   BFD_SEND (abfd, _section_already_linked, (abfd, sec, info))
> 
>  /* Extracted from simple.c.  */
>  bfd_byte *bfd_simple_get_relocated_section_contents
> Index: bfd/elf-bfd.h
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/bfd/elf-bfd.h,v
> retrieving revision 1.4
> diff -u -p -r1.4 elf-bfd.h
> --- bfd/elf-bfd.h25 Aug 2015 02:24:49 -1.4
> +++ bfd/elf-bfd.h17 Apr 2016 05:14:16 -
> @@ -1371,6 +1371,9 @@ struct elf_obj_tdata
> 
>/* Used to determine if we are creating an executable.  */
>bfd_boolean executable;
> +
> +  /* Symbol buffer.  */
> +  Elf_Internal_Sym *symbuf;
>  };
> 
>  #define elf_tdata(bfd)((bfd) -> tdata.elf_obj_data)
> @@ -1503,11 +1506,11 @@ extern bfd_boolean _bfd_elf_match_sectio
>  extern bfd_boolean bfd_elf_is_group_section
>(bfd *, const struct bfd_section *);
>  extern void _bfd_elf_section_already_linked
> -  (bfd *, struct bfd_section *);
> +  (bfd *, struct bfd_section *, struct bfd_link_info *);
>  extern void bfd_elf_set_group_contents
>(bfd *, asection *, void *);
>  extern asection *_bfd_elf_check_kept_section
> -  (asection *);
> +  (asection *, struct bfd_link_info *);
>  extern void _bfd_elf_link_just_syms
>(asection *, struct bfd_link_info *);
>  extern bfd_boolean _bfd_elf_copy_private_header_data
> @@ -1703,7 +1706,7 @@ extern bfd_boolean _bfd_elf_symbol_refs_
>(struct elf_link_hash_entry *, struct bfd_link_info *, bfd_boolean);
> 
>  extern bfd_boolean bfd_elf_match_symbols_in_sections
> -  (asection *sec1, asection *sec2);
> +  (asection *, asection *, struct bfd_link_info *);
> 
>  extern bfd_boolean _bfd_elf_setup_sections
>(bfd *);
> Index: bfd/elf.c
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/bfd/elf.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 elf.c
> --- bfd/elf.c13 Jan 2015 20:05:43 -1.7
> +++ bfd/elf.c17 Apr 2016 05:14:18 -
> @@ -3101,7 +3101,7 @@ assign_section_numbers (bfd *abfd, struc
>   s, s->owner);
>/* Point to the kept section if it has the same
>   

Re: numerous statfs bugs

2016-04-24 Thread Stefan Kempf
Martin Natano wrote:
> There seem to be a number of issues with statfs related code in the
> kernel. The first issue is inside of the copy_statfs_info() function
> which is designed to be used by the filesystem's .vfs_statfs
> implementations to copy data from mp->mnt_stat to the target stat
> buffer. copy_statfs_info() always copies the ufs_args from the
> mount_info union, although the function is also used by non-ufs
> filesystems. Copying the whole union instead of just one member should
> do the trick for the general case.
> 
> statfs(2) returns incomplete information for most filesystems: cd9660,
> udf, msdosfs and nfsv2 don't set f_namemax. ntfs and ext2fs don't set
> f_namemeax and f_favail. fusefs doesn't set f_mntfromspec, f_favail and
> f_iosize.
> 
> In sys_statfs(), the mount point specific >mnt_stat structure is
> passed as the stat buffer to VFS_STATFS. When looking at the case where
> (sb != >mnt-stat), the situation is even worse. cd9660, udf, fusefs,
> msdosfs, ntfs and ext2fs don't use copy_statfs_info(), so there is a lot
> of unset members in the stat buffer (f_fsid, f_owner, f_flags,
> f_syncwrites, f_asyncwrites, f_syncreads, f_asyncreads, f_mntonname,
> f_mntfromname and f_mntfromspec).
> 
> nfs copies MNAMELEN bytes from a smaller buffer into f_mntonname, so
> there is garbage after the null byte.
> 
> Diff below fixes all those issues. Ok?

Diff reads good to me. Any reason why you changed setting f_mntfromname
from "fusefs" to "fuse"?
 
> natano
> 
> 
> Index: isofs/cd9660/cd9660_vfsops.c
> ===
> RCS file: /cvs/src/sys/isofs/cd9660/cd9660_vfsops.c,v
> retrieving revision 1.77
> diff -u -p -r1.77 cd9660_vfsops.c
> --- isofs/cd9660/cd9660_vfsops.c  27 Mar 2016 11:39:37 -  1.77
> +++ isofs/cd9660/cd9660_vfsops.c  19 Apr 2016 18:52:52 -
> @@ -383,6 +383,7 @@ iso_mountfs(devvp, mp, p, argp)
>   mp->mnt_data = (qaddr_t)isomp;
>   mp->mnt_stat.f_fsid.val[0] = (long)dev;
>   mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
> + mp->mnt_stat.f_namemax = NAME_MAX;
>   mp->mnt_flag |= MNT_LOCAL;
>   isomp->im_mountp = mp;
>   isomp->im_dev = dev;
> @@ -650,13 +651,9 @@ cd9660_statfs(mp, sbp, p)
>   sbp->f_bavail = 0; /* blocks free for non superuser */
>   sbp->f_files =  0; /* total files */
>   sbp->f_ffree = 0; /* free file nodes */
> - if (sbp != >mnt_stat) {
> - bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN);
> - bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname,
> - MNAMELEN);
> - bcopy(>mnt_stat.mount_info.iso_args,
> - >mount_info.iso_args, sizeof(struct iso_args));
> - }
> + sbp->f_favail = 0; /* file nodes free for non superuser */
> + copy_statfs_info(sbp, mp);
> +
>   return (0);
>  }
>  
> Index: isofs/udf/udf_vfsops.c
> ===
> RCS file: /cvs/src/sys/isofs/udf/udf_vfsops.c,v
> retrieving revision 1.49
> diff -u -p -r1.49 udf_vfsops.c
> --- isofs/udf/udf_vfsops.c27 Mar 2016 11:39:37 -  1.49
> +++ isofs/udf/udf_vfsops.c19 Apr 2016 18:52:52 -
> @@ -264,6 +264,7 @@ udf_mountfs(struct vnode *devvp, struct 
>   mp->mnt_data = (qaddr_t) ump;
>   mp->mnt_stat.f_fsid.val[0] = devvp->v_rdev;
>   mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
> + mp->mnt_stat.f_namemax = NAME_MAX;
>   mp->mnt_flag |= MNT_LOCAL;
>  
>   ump->um_mountp = mp;
> @@ -542,6 +543,8 @@ udf_statfs(struct mount *mp, struct stat
>   sbp->f_bavail = 0;
>   sbp->f_files = 0;
>   sbp->f_ffree = 0;
> + sbp->f_favail = 0;
> + copy_statfs_info(sbp, mp);
>  
>   return (0);
>  }
> Index: kern/vfs_subr.c
> ===
> RCS file: /cvs/src/sys/kern/vfs_subr.c,v
> retrieving revision 1.244
> diff -u -p -r1.244 vfs_subr.c
> --- kern/vfs_subr.c   7 Apr 2016 09:58:11 -   1.244
> +++ kern/vfs_subr.c   19 Apr 2016 18:52:52 -
> @@ -2276,6 +2276,6 @@ copy_statfs_info(struct statfs *sbp, con
>   memcpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, MNAMELEN);
>   memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, MNAMELEN);
>   memcpy(sbp->f_mntfromspec, mp->mnt_stat.f_mntfromspec, MNAMELEN);
> - memcpy(>mount_info.ufs_args, >mnt_stat.mount_info.ufs_args,
> - sizeof(struct ufs_args));
> + memcpy(>mount_info, >mnt_stat.mount_info,
> + sizeof(union mount_info));
>  }
> Index: miscfs/fuse/fuse_vfsops.c
> ===
> RCS file: /cvs/src/sys/miscfs/fuse/fuse_vfsops.c,v
> retrieving revision 1.20
> diff -u -p -r1.20 fuse_vfsops.c
> --- miscfs/fuse/fuse_vfsops.c 27 Mar 2016 11:39:37 -  1.20
> +++ miscfs/fuse/fuse_vfsops.c 19 Apr 2016 18:52:52 -
> @@ -111,7 +111,9 @@ fusefs_mount(struct mount *mp, const 

Re: uvm: shrinking amap kmem consumption

2016-04-22 Thread Stefan Kempf
Stefan Kempf wrote:
> Next try. A problem with the last version was that amaps for shared
> mappings still would grow pretty large: establishing a shared mapping
> causes uvm to immediately allocate an amap for the whole range.

I messed up the last diff.

tb@ found and fixed a bug in the libkvm bits that would abort a make
build. (thanks!).

Updated diff below.

Index: lib/libkvm/kvm_proc.c
===
RCS file: /cvs/src/lib/libkvm/kvm_proc.c,v
retrieving revision 1.52
diff -u -p -r1.52 kvm_proc.c
--- lib/libkvm/kvm_proc.c   22 Oct 2014 04:13:35 -  1.52
+++ lib/libkvm/kvm_proc.c   22 Apr 2016 18:08:51 -
@@ -108,6 +108,56 @@ static int proc_verify(kvm_t *, const st
 static voidps_str_a(struct ps_strings *, u_long *, int *);
 static voidps_str_e(struct ps_strings *, u_long *, int *);
 
+static struct vm_anon *
+_kvm_findanon(kvm_t *kd, struct vm_amap *amapp, int slot)
+{
+   u_long addr;
+   int bucket;
+   struct vm_amap amap;
+   struct vm_amap_chunk chunk, *chunkp;
+   struct vm_anon *anonp;
+
+   addr = (u_long)amapp;
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   /* sanity-check slot number */
+   if (slot > amap.am_nslot)
+   return (NULL);
+
+   if (UVM_AMAP_SMALL())
+   chunkp = >am_small;
+   else {
+   bucket = UVM_AMAP_BUCKET(, slot);
+   addr = (u_long)(amap.am_buckets + bucket);
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   while (chunkp != NULL) {
+   addr = (u_long)chunkp;
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   if (UVM_AMAP_BUCKET(, chunk.ac_baseslot) !=
+   bucket)
+   return (NULL);
+   if (slot >= chunk.ac_baseslot &&
+   slot < chunk.ac_baseslot + chunk.ac_nslot)
+   break;
+
+   chunkp = TAILQ_NEXT(, ac_list);
+   }
+   if (chunkp == NULL)
+   return (NULL);
+   }
+
+   addr = (u_long)>ac_anon[UVM_AMAP_SLOTIDX(slot)];
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   return (anonp);
+}
+
 static char *
 _kvm_ureadm(kvm_t *kd, const struct kinfo_proc *p, u_long va, u_long *cnt)
 {
@@ -115,7 +165,6 @@ _kvm_ureadm(kvm_t *kd, const struct kinf
struct vmspace vm;
struct vm_anon *anonp, anon;
struct vm_map_entry vme;
-   struct vm_amap amap;
struct vm_page pg;
 
if (kd->swapspc == 0) {
@@ -153,18 +202,11 @@ _kvm_ureadm(kvm_t *kd, const struct kinf
if (vme.aref.ar_amap == NULL)
return (NULL);
 
-   addr = (u_long)vme.aref.ar_amap;
-   if (KREAD(kd, addr, ))
-   return (NULL);
-
offset = va - vme.start;
slot = offset / kd->nbpg + vme.aref.ar_pageoff;
-   /* sanity-check slot number */
-   if (slot > amap.am_nslot)
-   return (NULL);
 
-   addr = (u_long)amap.am_anon + (offset / kd->nbpg) * sizeof(anonp);
-   if (KREAD(kd, addr, ))
+   anonp = _kvm_findanon(kd, vme.aref.ar_amap, slot);
+   if (anonp == NULL)
return (NULL);
 
addr = (u_long)anonp;
Index: sys/uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.66
diff -u -p -r1.66 uvm_amap.c
--- sys/uvm/uvm_amap.c  16 Apr 2016 18:39:31 -  1.66
+++ sys/uvm/uvm_amap.c  22 Apr 2016 18:08:54 -
@@ -44,20 +44,19 @@
 #include 
 
 /*
- * pool for allocation of vm_map structures.  note that in order to
+ * pools for allocation of vm_amap structures.  note that in order to
  * avoid an endless loop, the amap pool's allocator cannot allocate
  * memory from an amap (it currently goes through the kernel uobj, so
  * we are ok).
  */
 
 struct pool uvm_amap_pool;
-
-/* Pools for amap slots for the most common amap slot sizes */
-struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
+struct pool uvm_small_amap_pool[UVM_AMAP_CHUNK];
+struct pool uvm_amap_chunk_pool;
 
 LIST_HEAD(, vm_amap) amap_list;
 
-static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
+static char amap_small_pool_names[UVM_AMAP_CHUNK][9];
 
 #define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
 
@@ -65,10 +64,12 @@ static char amap_slot_pool_names[UVM_AMA
  * local functions
  */
 
-static struct vm_amap *amap_alloc1(int, int);
+static struct vm_amap *amap_alloc1(int, int, int);
 static __inline void amap_list_insert(struct vm_amap *);
 static __inline void amap_list_remove(struct vm_amap *);   
 
+struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int);
+
 static __inline void
 amap_list_insert(struct vm_amap *

Re: uvm: shrinking amap kmem consumption

2016-04-22 Thread Stefan Kempf
Next try. A problem with the last version was that amaps for shared
mappings still would grow pretty large: establishing a shared mapping
causes uvm to immediately allocate an amap for the whole range.

That's a problem when changing vmm to use shared memory to allow vmd(8)
to access the memory of a guest with lots of RAM.

This diff allows amaps to organize slots in clusters that are
allocated on demand. A virtual memory range that only contains a few
pages will consume only small amounts of kmem for the amap.

- Amaps are clustered in chunks of 16 slots now, with a bitmap per
  chunk to keep track of used slots. That's cheaper than the current
  impl. Chunks are allocated by pool(9).
- The chunks are organized in a hash table. The number of buckets is
  computed such that the max. number of collisions is in log2(slots).
- Small amaps < 16 slots embed the cluster directly in the amap.
  The extra code to handle this special case is easy IMO. Though
  the amap definition got more ugly.
- libkvm needs to be recompiled as well because the amap layout has changed
- make -j4 build has become slightly faster on my machine

  21m13.20s real33m59.18s user  20m17.36s system
  vs.
  20m35.41s real33m56.55s user  18m55.17s system

With this diff it should be harder to run a machine out of kmem because
of large amap allocations caused by creative use of mmap().

So I welcome testing and comments.

Index: lib/libkvm/kvm_proc.c
===
RCS file: /cvs/src/lib/libkvm/kvm_proc.c,v
retrieving revision 1.52
diff -u -p -r1.52 kvm_proc.c
--- lib/libkvm/kvm_proc.c   22 Oct 2014 04:13:35 -  1.52
+++ lib/libkvm/kvm_proc.c   22 Apr 2016 15:49:54 -
@@ -108,6 +108,56 @@ static int proc_verify(kvm_t *, const st
 static voidps_str_a(struct ps_strings *, u_long *, int *);
 static voidps_str_e(struct ps_strings *, u_long *, int *);
 
+static struct vm_anon *
+_kvm_findanon(kvm_t *kd, struct vm_amap *amapp, int slot)
+{
+   u_long addr;
+   int bucket;
+   struct vm_amap amap;
+   struct vm_amap_chunk chunk, *chunkp;
+   struct vm_anon *anonp;
+
+   addr = (u_long)amapp;
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   /* sanity-check slot number */
+   if (slot > amap.am_nslot)
+   return (NULL);
+
+   if (UVM_AMAP_SMALL())
+   chunkp = >am_small;
+   else {
+   bucket = AMAP_BUCKET(, slot);
+   addr = (u_long)(amap.am_buckets + bucket);
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   while (chunkp != NULL) {
+   addr = (u_long)chunkp;
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   if (UVM_AMAP_BUCKET(, chunk.ac_baseslot) !=
+   bucket)
+   return (NULL);
+   if (slot >= chunk.ac_baseslot &&
+   slot < chunk.ac_baseslot + chunk.ac_nslot)
+   break;
+
+   chunkp = TAILQ_NEXT(, ac_list);
+   }
+   if (chunkp == NULL)
+   return (NULL);
+   }
+
+   addr = (u_long)>ac_anon[UVM_AMAP_SLOTIDX(slot)];
+   if (KREAD(kd, addr, ))
+   return (NULL);
+
+   return (anonp);
+}
+
 static char *
 _kvm_ureadm(kvm_t *kd, const struct kinfo_proc *p, u_long va, u_long *cnt)
 {
@@ -115,7 +165,6 @@ _kvm_ureadm(kvm_t *kd, const struct kinf
struct vmspace vm;
struct vm_anon *anonp, anon;
struct vm_map_entry vme;
-   struct vm_amap amap;
struct vm_page pg;
 
if (kd->swapspc == 0) {
@@ -153,18 +202,11 @@ _kvm_ureadm(kvm_t *kd, const struct kinf
if (vme.aref.ar_amap == NULL)
return (NULL);
 
-   addr = (u_long)vme.aref.ar_amap;
-   if (KREAD(kd, addr, ))
-   return (NULL);
-
offset = va - vme.start;
slot = offset / kd->nbpg + vme.aref.ar_pageoff;
-   /* sanity-check slot number */
-   if (slot > amap.am_nslot)
-   return (NULL);
 
-   addr = (u_long)amap.am_anon + (offset / kd->nbpg) * sizeof(anonp);
-   if (KREAD(kd, addr, ))
+   anonp = _kvm_findanon(kd, vme.aref.ar_amap, slot);
+   if (anonp == NULL)
return (NULL);
 
addr = (u_long)anonp;
Index: sys/uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.66
diff -u -p -r1.66 uvm_amap.c
--- sys/uvm/uvm_amap.c  16 Apr 2016 18:39:31 -  1.66
+++ sys/uvm/uvm_amap.c  22 Apr 2016 15:49:58 -
@@ -44,20 +44,19 @@
 #include 
 
 /*
- * pool for allocation of vm_map structures.  note that in order to
+ * pools for allocation of vm_amap structures.  note that in order to
  * avoid an endless loop, the amap 

libkvm: Make _kvm_uread inspect proper amap slot to read userspace page

2016-04-16 Thread Stefan Kempf
_kvm_uread maps a userspace page to a slot in the amap, but
only checks whether the slot is indeed within the amap.
It must also use the slot to extract the correct vm_anon in order
to find the physical address to read from.

The attached program shows that kvm_uread otherwise attempts reads
from an incorrect userspace address.

ok?

Index: lib/libkvm/kvm_proc.c
===
RCS file: /cvs/src/lib/libkvm/kvm_proc.c,v
retrieving revision 1.52
diff -u -p -r1.52 kvm_proc.c
--- lib/libkvm/kvm_proc.c   22 Oct 2014 04:13:35 -  1.52
+++ lib/libkvm/kvm_proc.c   16 Apr 2016 18:23:32 -
@@ -163,7 +163,7 @@ _kvm_ureadm(kvm_t *kd, const struct kinf
if (slot > amap.am_nslot)
return (NULL);
 
-   addr = (u_long)amap.am_anon + (offset / kd->nbpg) * sizeof(anonp);
+   addr = (u_long)(amap.am_anon + slot);
if (KREAD(kd, addr, ))
return (NULL);
 
/*
 * Compile with: gcc -O2 libkvmtest.c -o libkvmtest -lkvm
 * Run as root.
 */

#include 
#include 
#include 

#include 
#include 
#include 
#include 
#include 
#include 

#define ARGSIZE (2 * 4096)

#define DO_MUNMAP   1

int
main(int argc, char **argv)
{
    int cnt, i;
    char *p, **pargv;
    kvm_t *kd;
    struct kinfo_proc *kp;

    /*
     * Demonstrates the libkvm amap-slot bug: overwrite argv[0] with a
     * pointer into the second page of a split mapping, then read our
     * own argument vector back through kvm_getargv(3).
     * NOTE(review): '&' characters below were restored after
     * mail-archive mangling ("argv[0] = [4096]" and a missing "&cnt").
     */
    p = mmap(NULL, ARGSIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
        -1, 0);
    if (p == MAP_FAILED)
        err(1, "mmap");

    /*
     * Make sure that the kernel creates an amap for the mapped range
     * by populating it with pages.
     */
    memset(p, 0, ARGSIZE);

    /*
     * Now unmap the first page. This causes the mapped
     * range to be split into two vm_map_entries. The
     * first entry is freed. The second vm_map_entry represents
     * the second page in the range. The second map entry points
     * to the previously created amap. Since the amap still
     * contains two slots to represent the original range,
     * the second vm_map_entry must point into the amap with
     * an offset of 1 to refer to the slot representing the second
     * page.
     */
#if DO_MUNMAP
    if (munmap(p, 4096) == -1)
        err(1, "munmap");
#endif

    p[4096] = 'A';
    argv[0] = &p[4096];

    kd = kvm_openfiles("./libkvmtest", "/dev/mem", NULL, O_RDONLY,
        NULL);
    if (kd == NULL)
        errx(1, "kvm_open");
    kp = kvm_getprocs(kd, KERN_PROC_PID, getpid(), sizeof *kp, &cnt);
    if (kp == NULL || cnt != 1)
        errx(1, "kvm_getprocs");

    pargv = kvm_getargv(kd, kp, 8192);
    if (pargv == NULL)
        errx(1, "pargv");
    for (i = 0; pargv[i] != NULL; i++)
        printf("pargv[%d]: %s\n", i, pargv[i]);

    return 0;

}


uvm: remove am_maxslot from amap

2016-04-16 Thread Stefan Kempf
am_maxslot represents the total number of slots an amap can be extended
to. Since we do not extend amaps, this field as well as rounding the
number of slots to the next malloc bucket is not useful.

This also removes the corresponding output from procmap(1).

ok?

Index: sys/uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.65
diff -u -p -r1.65 uvm_amap.c
--- sys/uvm/uvm_amap.c  12 Apr 2016 16:47:33 -  1.65
+++ sys/uvm/uvm_amap.c  16 Apr 2016 14:29:55 -
@@ -180,44 +180,37 @@ static inline struct vm_amap *
 amap_alloc1(int slots, int waitf)
 {
struct vm_amap *amap;
-   int totalslots;
 
amap = pool_get(_amap_pool, (waitf == M_WAITOK) ? PR_WAITOK
: PR_NOWAIT);
if (amap == NULL)
return(NULL);
 
-   totalslots = slots;
-   KASSERT(totalslots > 0);
-
-   if (totalslots > UVM_AMAP_CHUNK)
-   totalslots = malloc_roundup(totalslots * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   KASSERT(slots > 0);
 
amap->am_ref = 1;
amap->am_flags = 0;
 #ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
 #endif
-   amap->am_maxslot = totalslots;
amap->am_nslot = slots;
amap->am_nused = 0;
 
-   if (totalslots > UVM_AMAP_CHUNK)
-   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT,
+   if (slots > UVM_AMAP_CHUNK)
+   amap->am_slots = malloc(slots * MALLOC_SLOT_UNIT,
M_UVMAMAP, waitf);
else
amap->am_slots = pool_get(
-   _amap_slot_pools[totalslots - 1],
+   _amap_slot_pools[slots - 1],
(waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT);
 
if (amap->am_slots == NULL)
goto fail1;
 
-   amap->am_bckptr = (int *)(((char *)amap->am_slots) + totalslots *
+   amap->am_bckptr = (int *)(((char *)amap->am_slots) + slots *
sizeof(int));
amap->am_anon = (struct vm_anon **)(((char *)amap->am_bckptr) +
-   totalslots * sizeof(int));
+   slots * sizeof(int));
 
return(amap);
 
@@ -243,7 +236,7 @@ amap_alloc(vaddr_t sz, int waitf)
amap = amap_alloc1(slots, waitf);
if (amap) {
memset(amap->am_anon, 0,
-   amap->am_maxslot * sizeof(struct vm_anon *));
+   amap->am_nslot * sizeof(struct vm_anon *));
amap_list_insert(amap);
}
 
@@ -263,10 +256,10 @@ amap_free(struct vm_amap *amap)
KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
 
-   if (amap->am_maxslot > UVM_AMAP_CHUNK)
+   if (amap->am_nslot > UVM_AMAP_CHUNK)
free(amap->am_slots, M_UVMAMAP, 0);
else
-   pool_put(_amap_slot_pools[amap->am_maxslot - 1],
+   pool_put(_amap_slot_pools[amap->am_nslot - 1],
amap->am_slots);
 
 #ifdef UVM_AMAP_PPREF
@@ -409,8 +402,7 @@ amap_copy(struct vm_map *map, struct vm_
amap->am_slots[amap->am_nused] = lcv;
amap->am_nused++;
}
-   memset(>am_anon[lcv], 0,
-   (amap->am_maxslot - lcv) * sizeof(struct vm_anon *));
+   KASSERT(lcv == amap->am_nslot);
 
/*
 * drop our reference to the old amap (srcamap).
@@ -570,7 +562,7 @@ void
 amap_pp_establish(struct vm_amap *amap)
 {
 
-   amap->am_ppref = mallocarray(amap->am_maxslot, sizeof(int),
+   amap->am_ppref = mallocarray(amap->am_nslot, sizeof(int),
M_UVMAMAP, M_NOWAIT|M_ZERO);
 
/* if we fail then we just won't use ppref for this amap */
Index: sys/uvm/uvm_amap.h
===
RCS file: /cvs/src/sys/uvm/uvm_amap.h,v
retrieving revision 1.23
diff -u -p -r1.23 uvm_amap.h
--- sys/uvm/uvm_amap.h  4 Apr 2016 16:34:16 -   1.23
+++ sys/uvm/uvm_amap.h  16 Apr 2016 14:29:55 -
@@ -124,8 +124,7 @@ boolean_t   amap_swap_off(int, int);
 struct vm_amap {
int am_ref; /* reference count */
int am_flags;   /* flags */
-   int am_maxslot; /* max # of slots allocated */
-   int am_nslot;   /* # of slots currently in map ( <= maxslot) */
+   int am_nslot;   /* # of slots currently in map */
int am_nused;   /* # of slots currently in use */
int *am_slots;  /* contig array of active slots */
int *am_bckptr; /* back pointer array to am_slots */
Index: usr.sbin/procmap/procmap.c
===
RCS file: /cvs/src/usr.sbin/procmap/procmap.c,v
retrieving revision 1.59
diff -u -p -r1.59 procmap.c
--- usr.sbin/procmap/procmap.c  19 Jan 2015 19:25:28 -  1.59
+++ usr.sbin/procmap/procmap.c  16 Apr 2016 14:29:56 -
@@ -97,7 +97,6 @@ rlim_t 

Re: uvm amap: Simplify amap traversal in amap_swap_off

2016-04-10 Thread Stefan Kempf
I'd like to commit this soon unless there are objections.

Stefan Kempf wrote:
> The recent uvm commits fixed hangs because machines went out of memory
> because of using too much space for amap slots.
> 
> It's possible to shrink memory requirements for amaps even more,
> but the current code needs some simplifications first. So here's another
> cleanup diff. There'll be one or two more before we get to the real stuff.
> 
> This one simplifies the traversal of an amap when swap is disabled.
> 
> There's no need to insert marker elements to find the next item in the
> amap list. The next amap can be determined by looking at the
> currently examined amap.
> 
> Care must be taken to get the next element before the current amap is
> possibly deleted, and after all the current amap's pages were read in
> from swap (because the page-in may sleep and remove items from the amap
> list).
> 
> ok?
> 
> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.62
> diff -u -p -r1.62 uvm_amap.c
> --- uvm/uvm_amap.c16 Mar 2016 16:53:43 -  1.62
> +++ uvm/uvm_amap.c17 Mar 2016 18:08:29 -
> @@ -902,21 +902,11 @@ amap_swap_off(int startslot, int endslot
>  {
>   struct vm_amap *am;
>   struct vm_amap *am_next;
> - struct vm_amap marker_prev;
> - struct vm_amap marker_next;
>   boolean_t rv = FALSE;
>  
> -#if defined(DIAGNOSTIC)
> - memset(_prev, 0, sizeof(marker_prev));
> - memset(_next, 0, sizeof(marker_next));
> -#endif /* defined(DIAGNOSTIC) */
> -
>   for (am = LIST_FIRST(_list); am != NULL && !rv; am = am_next) {
>   int i;
>  
> - LIST_INSERT_BEFORE(am, _prev, am_list);
> - LIST_INSERT_AFTER(am, _next, am_list);
> -
>   for (i = 0; i < am->am_nused; i++) {
>   int slot;
>   int swslot;
> @@ -935,23 +925,14 @@ amap_swap_off(int startslot, int endslot
>   rv = uvm_anon_pagein(anon);
>  
>   am->am_flags &= ~AMAP_SWAPOFF;
> - if (amap_refs(am) == 0) {
> - amap_wipeout(am);
> - am = NULL;
> + if (rv || amap_refs(am) == 0)
>   break;
> - }
> - if (rv) {
> - break;
> - }
>   i = 0;
>   }
>  
> - KASSERT(LIST_NEXT(_prev, am_list) == _next ||
> - LIST_NEXT(LIST_NEXT(_prev, am_list), am_list) ==
> - _next);
> - am_next = LIST_NEXT(_next, am_list);
> - LIST_REMOVE(_prev, am_list);
> - LIST_REMOVE(_next, am_list);
> + am_next = LIST_NEXT(am, am_list);
> + if (amap_refs(am) == 0)
> + amap_wipeout(am);
>   }
>  
>   return rv;



Re: vmm: guest<->host communication via shared memory

2016-04-06 Thread Stefan Kempf
Stefan Kempf wrote:
> Hi,
> 
> here comes a diff for vmm, and I'd like to ask people that are
> interested in our hypervisor to test this. If you are experimenting
> with vmm already, just do what you always do with vmm when running
> with this diff :-)
> 
> [...]
> 
> This diff will not go in at once. The first thing that should be
> committed is an addition to uvm. I'll post that one separately in this
> thread and ask for reviews.

Here are just the uvm parts:
 
This diff has just the uvm parts with a new main function uvm_share()
and a helper function uvm_mapent_share(). Nothing uses it yet, but
vmm(4) will call it later. So this diff should have no impact on the
rest of uvm or the kernel.

What uvm_share() does is that it takes two virtual address ranges [A,B]
and [C,D], and makes sure that [A,B] and [C,D] both get mapped to
the same physical pages.

uvm already has the possibility to establish such shared mappings
(uvm_mapent_forkshared). I pulled out the common functionality
that uvm_share() needs as well into uvm_mapent_share() and made
uvm_mapent_clone a little more generic.

The only thing that uvm_share() does is that the source address
range [A,B] exists in the source address space, and that it is
backed by memory (whether it's anon memory or whether is comes
from a file does not matter). And the destination address range
[C, D] must still be available in the destination address space.

Comments, oks?

Background:

vmm(4) creates a separate (virtual) address space
for the guest VM. These guest physical addresses are then mapped to
"real" physical RAM on the host. But the memory for the guest is
currently allocated within the kernel and not directly visible to vmd.

With this diff, we can later have vmd(8) allocate a large chunk of
memory via mmap(), and have this memory correspond to a guest physical
range in the guest VM.

The protection bits for the two address ranges can be different. That
way, the allocated memory in vmd(8) will be non-executable. In the
guest itself however, the memory is executable.

Index: uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.138
diff -u -p -r1.138 uvm_extern.h
--- uvm/uvm_extern.h4 Apr 2016 16:34:16 -   1.138
+++ uvm/uvm_extern.h6 Apr 2016 17:57:06 -
@@ -428,6 +428,8 @@ voiduvmspace_exec(struct proc *, 
vadd
 struct vmspace *uvmspace_fork(struct process *);
 void   uvmspace_free(struct vmspace *);
 struct vmspace *uvmspace_share(struct process *);
+intuvm_share(vm_map_t, vaddr_t, vm_prot_t,
+   vm_map_t, vaddr_t, vsize_t);
 void   uvm_meter(void);
 intuvm_sysctl(int *, u_int, void *, size_t *, 
void *, size_t, struct proc *);
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.211
diff -u -p -r1.211 uvm_map.c
--- uvm/uvm_map.c   4 Apr 2016 16:34:16 -   1.211
+++ uvm/uvm_map.c   6 Apr 2016 17:57:06 -
@@ -182,8 +182,12 @@ int uvm_mapent_bias(struct 
vm_map*, s
  * uvm_vmspace_fork helper functions.
  */
 struct vm_map_entry*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
-   vsize_t, struct vm_map_entry*,
-   struct uvm_map_deadq*, int, int);
+   vsize_t, vm_prot_t, vm_prot_t,
+   struct vm_map_entry*, struct uvm_map_deadq*, int,
+   int);
+struct vm_map_entry*uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
+   vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
+   struct vm_map_entry*, struct uvm_map_deadq*);
 struct vm_map_entry*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
struct vm_map*, struct vm_map_entry*,
struct uvm_map_deadq*);
@@ -3364,6 +3368,98 @@ uvmspace_free(struct vmspace *vm)
 }
 
 /*
+ * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
+ * srcmap to the address range [dstaddr, dstaddr + sz) in
+ * dstmap.
+ *
+ * The whole address range in srcmap must be backed by an object
+ * (no holes).
+ *
+ * If successful, the address ranges share memory and the destination
+ * address range uses the protection flags in prot.
+ *
+ * This routine assumes that sz is a multiple of PAGE_SIZE and
+ * that dstaddr and srcaddr are page-aligned.
+ */
+int
+uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
+struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
+{
+   int ret = 0;
+   vaddr_t unmap_end;
+   vaddr_t dstva;
+   vsize_t off, len, n = sz;
+   struct vm_map_entry *first = NULL, *last 

vmm: guest<->host communication via shared memory

2016-04-06 Thread Stefan Kempf
Hi,

here comes a diff for vmm, and I'd like to ask people that are
interested in our hypervisor to test this. If you are experimenting
with vmm already, just do what you always do with vmm when running
with this diff :-)

How to apply the diff:
$ cd /usr/src
$ patch < vmm.diff
$ doas make includes

Then rebuild and install kernel, vmd, and vmctl.

This diff will not go in at once. The first thing that should be
committed is an addition to uvm. I'll post that one separately in this
thread and ask for reviews.

Some details about the diff:

Currently, when vmd(8) needs to access the memory of the guest VM
(for example to copy the kernel image, read/write network packets, etc.),
it has to do readpage and writepage ioctls. For memory accesses
that are larger than one page, several such ioctls are needed.

A nicer way would be to have the guest memory accessible to the
userland vmd(8) via shared memory. That allows us to do memory
accesses through pointers and with ordinary loads/stores.

This works by having vmd(8) allocate guest memory via mmap(),
and pass these userspace pointers to vmm(4). vmm(4) then also
maps guest physical addresses to the mmap'd range.

It's similar to what KVM does, and has a few benefits:
- the RAM size of a VM is restricted to the max data size of vmd(8),
  and the kernel takes care of this. No need to invent a separate
  resource limitation mechanism for vmm(4)
- memory accesses between guest and host are faster, obviously

This diff:
- adds a new uvm subroutine uvm_share() that establishes a
  shared mapping of a memory range. vmm(4) calls this to
  map guest physical ranges to the mmap'd regions allocated
  by userspace.
- Makes vmm(4) use uvm_share instead of allocating virtual memory
  ranges for the guest itself
- Makes vmd(8) allocate memory for the guest VM
- Makes vmd(8) use memcpy instead of read/writepage ioctls
- removes the vm_readpage and vm_writepage ioctls from vmm(4)

Again, the uvm parts should go in first, and the vmm bits will
be finished afterwards.

Index: sys/arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.48
diff -u -p -r1.48 vmm.c
--- sys/arch/amd64/amd64/vmm.c  6 Apr 2016 06:15:06 -   1.48
+++ sys/arch/amd64/amd64/vmm.c  6 Apr 2016 17:46:19 -
@@ -110,8 +110,6 @@ int vm_create(struct vm_create_params *,
 int vm_run(struct vm_run_params *);
 int vm_terminate(struct vm_terminate_params *);
 int vm_get_info(struct vm_info_params *);
-int vm_writepage(struct vm_writepage_params *);
-int vm_readpage(struct vm_readpage_params *);
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vm_intr_pending(struct vm_intr_params *);
 int vcpu_reset_regs(struct vcpu *, struct vcpu_init_state *);
@@ -126,9 +124,9 @@ int vcpu_run_svm(struct vcpu *, uint8_t)
 void vcpu_deinit(struct vcpu *);
 void vcpu_deinit_vmx(struct vcpu *);
 void vcpu_deinit_svm(struct vcpu *);
-int vm_impl_init(struct vm *);
-int vm_impl_init_vmx(struct vm *);
-int vm_impl_init_svm(struct vm *);
+int vm_impl_init(struct vm *, struct proc *);
+int vm_impl_init_vmx(struct vm *, struct proc *);
+int vm_impl_init_svm(struct vm *, struct proc *);
 void vm_impl_deinit(struct vm *);
 void vm_impl_deinit_vmx(struct vm *);
 void vm_impl_deinit_svm(struct vm *);
@@ -344,12 +342,6 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
case VMM_IOC_TERM:
ret = vm_terminate((struct vm_terminate_params *)data);
break;
-   case VMM_IOC_WRITEPAGE:
-   ret = vm_writepage((struct vm_writepage_params *)data);
-   break;
-   case VMM_IOC_READPAGE:
-   ret = vm_readpage((struct vm_readpage_params *)data);
-   break;
case VMM_IOC_RESETCPU:
ret = vm_resetcpu((struct vm_resetcpu_params *)data);
break;
@@ -383,8 +375,6 @@ pledge_ioctl_vmm(struct proc *p, long co
case VMM_IOC_TERM:
/* XXX VM processes should only terminate themselves */
case VMM_IOC_RUN:
-   case VMM_IOC_WRITEPAGE:
-   case VMM_IOC_READPAGE:
case VMM_IOC_RESETCPU:
return (0);
}
@@ -404,89 +394,6 @@ vmmclose(dev_t dev, int flag, int mode, 
 }
 
 /*
- * vm_readpage
- *
- * Reads a region (PAGE_SIZE max) of guest physical memory using the parameters
- * defined in 'vrp'.
- *
- * Returns 0 if successful, or various error codes on failure:
- *  ENOENT if the VM id contained in 'vrp' refers to an unknown VM
- *  EINVAL if the memory region described by vrp is not regular memory
- *  EFAULT if the memory region described by vrp has not yet been faulted in
- *  by the guest
- */
-int
-vm_readpage(struct vm_readpage_params *vrp)
-{
-   struct vm *vm;
-   paddr_t host_pa;
-   void *kva;
-   vaddr_t vr_page;
-
-   /* Find the desired VM */
-   rw_enter_read(_softc->vm_lock);
-   SLIST_FOREACH(vm, _softc->vm_list, vm_link) {
-   

uvm: UVM_FLAG_AMAPPAD has no effect anymore, nuke it

2016-04-03 Thread Stefan Kempf
This flag caused amaps to be allocated with additional spare slots, to
make extending them cheaper. However, the kernel never extends amaps,
so allocating spare slots is pointless. Also UVM_FLAG_AMAPPAD only
has an effect in combination with UVM_FLAG_OVERLAY. The only function
that used both flags was sys_obreak, but that function had the use of
UVM_FLAG_OVERLAY removed recently.

While there, kill the unused prototypes amap_flags and amap_refs.
They're defined as macros already.

ok?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.63
diff -u -p -r1.63 uvm_amap.c
--- uvm/uvm_amap.c  27 Mar 2016 09:51:37 -  1.63
+++ uvm/uvm_amap.c  3 Apr 2016 14:00:29 -
@@ -65,7 +65,7 @@ static char amap_slot_pool_names[UVM_AMA
  * local functions
  */
 
-static struct vm_amap *amap_alloc1(int, int, int);
+static struct vm_amap *amap_alloc1(int, int);
 static __inline void amap_list_insert(struct vm_amap *);
 static __inline void amap_list_remove(struct vm_amap *);   
 
@@ -177,7 +177,7 @@ amap_init(void)
  * init the overlay.
  */
 static inline struct vm_amap *
-amap_alloc1(int slots, int padslots, int waitf)
+amap_alloc1(int slots, int waitf)
 {
struct vm_amap *amap;
int totalslots;
@@ -187,7 +187,7 @@ amap_alloc1(int slots, int padslots, int
if (amap == NULL)
return(NULL);
 
-   totalslots = slots + padslots;
+   totalslots = slots;
KASSERT(totalslots > 0);
 
if (totalslots > UVM_AMAP_CHUNK)
@@ -233,15 +233,14 @@ fail1:
  * => reference count to new amap is set to one
  */
 struct vm_amap *
-amap_alloc(vaddr_t sz, vaddr_t padsz, int waitf)
+amap_alloc(vaddr_t sz, int waitf)
 {
struct vm_amap *amap;
-   int slots, padslots;
+   int slots;
 
AMAP_B2SLOT(slots, sz); /* load slots */
-   AMAP_B2SLOT(padslots, padsz);
 
-   amap = amap_alloc1(slots, padslots, waitf);
+   amap = amap_alloc1(slots, waitf);
if (amap) {
memset(amap->am_anon, 0,
amap->am_maxslot * sizeof(struct vm_anon *));
@@ -361,7 +360,7 @@ amap_copy(struct vm_map *map, struct vm_
}
 
entry->aref.ar_pageoff = 0;
-   entry->aref.ar_amap = amap_alloc(entry->end - entry->start, 0,
+   entry->aref.ar_amap = amap_alloc(entry->end - entry->start,
waitf);
if (entry->aref.ar_amap != NULL)
entry->etype &= ~UVM_ET_NEEDSCOPY;
@@ -381,7 +380,7 @@ amap_copy(struct vm_map *map, struct vm_
 
/* looks like we need to copy the map. */
AMAP_B2SLOT(slots, entry->end - entry->start);
-   amap = amap_alloc1(slots, 0, waitf);
+   amap = amap_alloc1(slots, waitf);
if (amap == NULL)
return;
srcamap = entry->aref.ar_amap;
Index: uvm/uvm_amap.h
===
RCS file: /cvs/src/sys/uvm/uvm_amap.h,v
retrieving revision 1.22
diff -u -p -r1.22 uvm_amap.h
--- uvm/uvm_amap.h  27 Mar 2016 09:51:37 -  1.22
+++ uvm/uvm_amap.h  3 Apr 2016 14:00:29 -
@@ -66,15 +66,13 @@ struct vm_amap;
 void   amap_add(struct vm_aref *, vaddr_t, struct vm_anon *,
boolean_t);
/* allocate a new amap */
-struct vm_amap *amap_alloc(vaddr_t, vaddr_t, int);
+struct vm_amap *amap_alloc(vaddr_t, int);
/* clear amap needs-copy flag */
 void   amap_copy(vm_map_t, vm_map_entry_t, int, boolean_t, vaddr_t,
vaddr_t);
/* resolve all COW faults now */
 void   amap_cow_now(vm_map_t, vm_map_entry_t);
/* get amap's flags */
-intamap_flags(struct vm_amap *);
-   /* free amap */
 void   amap_free(struct vm_amap *);
/* init amap module (at boot time) */
 void   amap_init(void);
@@ -85,8 +83,6 @@ void  amap_lookups(struct vm_aref *, vad
/* add a reference to an amap */
 void   amap_ref(struct vm_amap *, vaddr_t, vsize_t, int);
/* get number of references of amap */
-intamap_refs(struct vm_amap *);
-   /* split reference to amap into two */
 void   amap_splitref(struct vm_aref *, struct vm_aref *, vaddr_t);
/* remove an anon from an amap */
 void   amap_unadd(struct vm_aref *, vaddr_t);
Index: uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.137
diff -u -p -r1.137 uvm_extern.h
--- uvm/uvm_extern.h2 

Re: uvm: enable amap per-page refcounting unconditionally

2016-03-28 Thread Stefan Kempf
Miod Vallat wrote:
> 
> > It seems per-page reference counting is used since forever. I think
> > there's no reason to ever turn it off (and track referenced pages
> > with less accuracy, causing leaks).
> 
> Actually, assuming the #undef code path works, it might work keeping
> this and only defining UVM_AMAP_PPREF iff defined(SMALL_KERNEL).

Doing this saves around 1.6K on bsd.rd/amd64.

Would that be preferred over removing the #ifdefs?

textdatabss dec hex
4736948 2409000 577536  7723484 75d9dc
4738636 2409000 577536  7725172 75e074
 
diff --git a/uvm/uvm_amap.h b/uvm/uvm_amap.h
index a98b440..a768e94 100644
--- a/uvm/uvm_amap.h
+++ b/uvm/uvm_amap.h
@@ -119,7 +119,9 @@ boolean_t   amap_swap_off(int, int);
  * ... this is enabled with the "UVM_AMAP_PPREF" define.
  */
 
-#define UVM_AMAP_PPREF /* track partial references */
+#ifndef SMALL_KERNEL
+# define UVM_AMAP_PPREF/* track partial references */
+#endif
 
 /*
  * here is the definition of the vm_amap structure for this implementation.



uvm: enable amap per-page refcounting unconditionally

2016-03-27 Thread Stefan Kempf
It seems per-page reference counting is used since forever. I think
there's no reason to ever turn it off (and track referenced pages
with less accuracy, causing leaks).

So remove those #ifdefs.

ok?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.63
diff -u -p -r1.63 uvm_amap.c
--- uvm/uvm_amap.c  27 Mar 2016 09:51:37 -  1.63
+++ uvm/uvm_amap.c  27 Mar 2016 12:09:16 -
@@ -81,11 +81,9 @@ amap_list_remove(struct vm_amap *amap)
LIST_REMOVE(amap, am_list);
 }
 
-#ifdef UVM_AMAP_PPREF
 /*
- * what is ppref?   ppref is an _optional_ amap feature which is used
- * to keep track of reference counts on a per-page basis.  it is enabled
- * when UVM_AMAP_PPREF is defined.
+ * what is ppref?   ppref is an amap feature which is used
+ * to keep track of reference counts on a per-page basis.
  *
  * when enabled, an array of ints is allocated for the pprefs.  this
  * array is allocated only when a partial reference is added to the
@@ -147,7 +145,6 @@ pp_setreflen(int *ppref, int offset, int
ppref[offset+1] = len;
}
 }
-#endif
 
 /*
  * amap_init: called at boot time to init global amap data structures
@@ -196,9 +193,7 @@ amap_alloc1(int slots, int padslots, int
 
amap->am_ref = 1;
amap->am_flags = 0;
-#ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
-#endif
amap->am_maxslot = totalslots;
amap->am_nslot = slots;
amap->am_nused = 0;
@@ -270,10 +265,8 @@ amap_free(struct vm_amap *amap)
pool_put(_amap_slot_pools[amap->am_maxslot - 1],
amap->am_slots);
 
-#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
free(amap->am_ppref, M_UVMAMAP, 0);
-#endif
pool_put(_amap_pool, amap);
 
 }
@@ -422,12 +415,10 @@ amap_copy(struct vm_map *map, struct vm_
srcamap->am_ref--;
if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0)
srcamap->am_flags &= ~AMAP_SHARED;   /* clear shared flag */
-#ifdef UVM_AMAP_PPREF
if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
amap_pp_adjref(srcamap, entry->aref.ar_pageoff, 
(entry->end - entry->start) >> PAGE_SHIFT, -1);
}
-#endif
 
/* install new amap. */
entry->aref.ar_pageoff = 0;
@@ -551,19 +542,15 @@ amap_splitref(struct vm_aref *origref, s
if (origref->ar_amap->am_nslot - origref->ar_pageoff - leftslots <= 0)
panic("amap_splitref: map size check failed");
 
-#ifdef UVM_AMAP_PPREF
 /* establish ppref before we add a duplicate reference to the amap */
if (origref->ar_amap->am_ppref == NULL)
amap_pp_establish(origref->ar_amap);
-#endif
 
splitref->ar_amap = origref->ar_amap;
splitref->ar_amap->am_ref++;/* not a share reference */
splitref->ar_pageoff = origref->ar_pageoff + leftslots;
 }
 
-#ifdef UVM_AMAP_PPREF
-
 /*
  * amap_pp_establish: add a ppref array to an amap, if possible
  */
@@ -719,8 +706,6 @@ amap_wiperange(struct vm_amap *amap, int
}
 }
 
-#endif
-
 /*
  * amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
  *
@@ -911,7 +896,6 @@ amap_ref(struct vm_amap *amap, vaddr_t o
amap->am_ref++;
if (flags & AMAP_SHARED)
amap->am_flags |= AMAP_SHARED;
-#ifdef UVM_AMAP_PPREF
if (amap->am_ppref == NULL && (flags & AMAP_REFALL) == 0 &&
len != amap->am_nslot)
amap_pp_establish(amap);
@@ -921,7 +905,6 @@ amap_ref(struct vm_amap *amap, vaddr_t o
else
amap_pp_adjref(amap, offset, len, 1);
}
-#endif
 }
 
 /*
@@ -945,7 +928,6 @@ amap_unref(struct vm_amap *amap, vaddr_t
/* otherwise just drop the reference count(s) */
if (amap->am_ref == 1 && (amap->am_flags & AMAP_SHARED) != 0)
amap->am_flags &= ~AMAP_SHARED; /* clear shared flag */
-#ifdef UVM_AMAP_PPREF
if (amap->am_ppref == NULL && all == 0 && len != amap->am_nslot)
amap_pp_establish(amap);
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
@@ -954,5 +936,4 @@ amap_unref(struct vm_amap *amap, vaddr_t
else
amap_pp_adjref(amap, offset, len, -1);
}
-#endif
 }
Index: uvm/uvm_amap.h
===
RCS file: /cvs/src/sys/uvm/uvm_amap.h,v
retrieving revision 1.22
diff -u -p -r1.22 uvm_amap.h
--- uvm/uvm_amap.h  27 Mar 2016 09:51:37 -  1.22
+++ uvm/uvm_amap.h  27 Mar 2016 12:09:16 -
@@ -114,13 +114,10 @@ boolean_t amap_swap_off(int, int);
 
 /*
  * we currently provide an array-based amap implementation.  in this
- * implementation we provide the option of tracking split references
- * so that we don't lose track of references during 

Re: uvm: remove unused(?) amap_extend

2016-03-27 Thread Stefan Kempf
Martin Pieuchot wrote:
> On 26/03/16(Sat) 19:19, Stefan Kempf wrote:
> > Stefan Kempf wrote:
> > > amap_extend is called when merging two adjacent areas of virtual address
> > > space. However, merging is done only for kernel
> > > virtual address space. It's not done for user space:
> > > 
> > > uvm/uvm_map.c:1359
> > >   /*
> > >* Try to merge entry.
> > >*
> > >* Userland allocations are kept separated most of the time.
> > >* Forego the effort of merging what most of the time can't be merged
> > >* and only try the merge if it concerns a kernel entry.
> > >*/
> > >   if ((flags & UVM_FLAG_NOMERGE) == 0 &&
> > >   (map->flags & VM_MAP_ISVMSPACE) == 0)
> > >   uvm_mapent_tryjoin(map, entry, );
> > > 
> > > 
> > > As far as I can tell, kernel vm_map_entries do not use amaps. So
> > > amap_extend should never be called. Can we remove it?
> > 
> > Any objections against committing this diff?
> > 
> > Merging vm map entries for userspace was turned off a long time
> > ago in r1.104 of uvm/uvm_map.c.
> > 
> > The kernel doesn't use amaps either. Removing this would make the amap
> > shrink diffs simpler.
> 
> If it should never happen, what about putting a KASSERT() rather than
> returning NULL?

Good idea, IMO.

Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.209
diff -u -p -r1.209 uvm_map.c
--- uvm/uvm_map.c   15 Mar 2016 20:50:23 -  1.209
+++ uvm/uvm_map.c   27 Mar 2016 06:56:32 -
@@ -1448,14 +1448,14 @@ uvm_mapent_merge(struct vm_map *map, str
struct uvm_addr_state *free;
 
/*
-* Amap of e1 must be extended to include e2.
+* Merging is not supported for map entries that
+* contain an amap in e1. This should never happen
+* anyway, because only kernel entries are merged.
+* These do not contain amaps.
 * e2 contains no real information in its amap,
 * so it can be erased immediately.
 */
-   if (e1->aref.ar_amap) {
-   if (amap_extend(e1, e2->end - e2->start))
-   return NULL;
-   }
+   KASSERT(e1->aref.ar_amap == NULL);
 
/*
 * Don't drop obj reference:
 
> > > Index: uvm/uvm_amap.c
> > > ===
> > > RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> > > retrieving revision 1.62
> > > diff -u -p -r1.62 uvm_amap.c
> > > --- uvm/uvm_amap.c16 Mar 2016 16:53:43 -  1.62
> > > +++ uvm/uvm_amap.c23 Mar 2016 17:03:53 -
> > > @@ -279,174 +279,6 @@ amap_free(struct vm_amap *amap)
> > >  }
> > >  
> > >  /*
> > > - * amap_extend: extend the size of an amap (if needed)
> > > - *
> > > - * => called from uvm_map when we want to extend an amap to cover
> > > - *a new mapping (rather than allocate a new one)
> > > - * => to safely extend an amap it should have a reference count of
> > > - *one (thus it can't be shared)
> > > - * => XXXCDC: support padding at this level?
> > > - */
> > > -int
> > > -amap_extend(struct vm_map_entry *entry, vsize_t addsize)
> > > -{
> > > - struct vm_amap *amap = entry->aref.ar_amap;
> > > - int slotoff = entry->aref.ar_pageoff;
> > > - int slotmapped, slotadd, slotneed, slotalloc;
> > > -#ifdef UVM_AMAP_PPREF
> > > - int *newppref, *oldppref;
> > > -#endif
> > > - u_int *newsl, *newbck, *oldsl, *oldbck;
> > > - struct vm_anon **newover, **oldover;
> > > - int slotadded;
> > > -
> > > - /*
> > > -  * first, determine how many slots we need in the amap.  don't
> > > -  * forget that ar_pageoff could be non-zero: this means that
> > > -  * there are some unused slots before us in the amap.
> > > -  */
> > > - AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
> > > - AMAP_B2SLOT(slotadd, addsize);  /* slots to add */
> > > - slotneed = slotoff + slotmapped + slotadd;
> > > -
> > > - /*
> > > -  * case 1: we already have enough slots in the map and thus
> > > -  * only need to bump the reference counts on the slots we are
> > > -  * adding.
> > > -  */
> > > - if (amap->am_nslot >= slotneed) {
> > > -#ifdef UVM_AMAP_PPREF
> > > 

Re: uvm: remove unused(?) amap_extend

2016-03-26 Thread Stefan Kempf
Stefan Kempf wrote:
> amap_extend is called when merging two adjacent areas of virtual address
> space. However, merging is done only for kernel
> virtual address space. It's not done for user space:
> 
> uvm/uvm_map.c:1359
>   /*
>* Try to merge entry.
>*
>* Userland allocations are kept separated most of the time.
>* Forego the effort of merging what most of the time can't be merged
>* and only try the merge if it concerns a kernel entry.
>*/
>   if ((flags & UVM_FLAG_NOMERGE) == 0 &&
>   (map->flags & VM_MAP_ISVMSPACE) == 0)
>   uvm_mapent_tryjoin(map, entry, );
> 
> 
> As far as I can tell, kernel vm_map_entries do not use amaps. So
> amap_extend should never be called. Can we remove it?

Any objections against committing this diff?

Merging vm map entries for userspace was turned off a long time
ago in r1.104 of uvm/uvm_map.c.

The kernel doesn't use amaps either. Removing this would make the amap
shrink diffs simpler.

> Index: uvm/uvm_amap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> retrieving revision 1.62
> diff -u -p -r1.62 uvm_amap.c
> --- uvm/uvm_amap.c16 Mar 2016 16:53:43 -  1.62
> +++ uvm/uvm_amap.c23 Mar 2016 17:03:53 -
> @@ -279,174 +279,6 @@ amap_free(struct vm_amap *amap)
>  }
>  
>  /*
> - * amap_extend: extend the size of an amap (if needed)
> - *
> - * => called from uvm_map when we want to extend an amap to cover
> - *a new mapping (rather than allocate a new one)
> - * => to safely extend an amap it should have a reference count of
> - *one (thus it can't be shared)
> - * => XXXCDC: support padding at this level?
> - */
> -int
> -amap_extend(struct vm_map_entry *entry, vsize_t addsize)
> -{
> - struct vm_amap *amap = entry->aref.ar_amap;
> - int slotoff = entry->aref.ar_pageoff;
> - int slotmapped, slotadd, slotneed, slotalloc;
> -#ifdef UVM_AMAP_PPREF
> - int *newppref, *oldppref;
> -#endif
> - u_int *newsl, *newbck, *oldsl, *oldbck;
> - struct vm_anon **newover, **oldover;
> - int slotadded;
> -
> - /*
> -  * first, determine how many slots we need in the amap.  don't
> -  * forget that ar_pageoff could be non-zero: this means that
> -  * there are some unused slots before us in the amap.
> -  */
> - AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
> - AMAP_B2SLOT(slotadd, addsize);  /* slots to add */
> - slotneed = slotoff + slotmapped + slotadd;
> -
> - /*
> -  * case 1: we already have enough slots in the map and thus
> -  * only need to bump the reference counts on the slots we are
> -  * adding.
> -  */
> - if (amap->am_nslot >= slotneed) {
> -#ifdef UVM_AMAP_PPREF
> - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
> - amap_pp_adjref(amap, slotoff + slotmapped, slotadd, 1);
> - }
> -#endif
> - return (0);
> - }
> -
> - /*
> -  * case 2: we pre-allocated slots for use and we just need to
> -  * bump nslot up to take account for these slots.
> -  */
> - if (amap->am_maxslot >= slotneed) {
> -#ifdef UVM_AMAP_PPREF
> - if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
> - if ((slotoff + slotmapped) < amap->am_nslot)
> - amap_pp_adjref(amap, slotoff + slotmapped, 
> - (amap->am_nslot - (slotoff + slotmapped)),
> - 1);
> - pp_setreflen(amap->am_ppref, amap->am_nslot, 1, 
> -slotneed - amap->am_nslot);
> - }
> -#endif
> - amap->am_nslot = slotneed;
> - /*
> -  * no need to zero am_anon since that was done at
> -  * alloc time and we never shrink an allocation.
> -  */
> - return (0);
> - }
> -
> - /*
> -  * case 3: we need to malloc a new amap and copy all the amap
> -  * data over from old amap to the new one.
> -  *
> -  * XXXCDC: could we take advantage of a kernel realloc()?  
> -  */
> - if (slotneed >= UVM_AMAP_LARGE)
> - return E2BIG;
> -
> - if (slotneed > UVM_AMAP_CHUNK)
> - slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
> - MALLOC_SLOT_UNIT;
> - else
> - slotalloc = slotneed

uvm: remove unused(?) amap_extend

2016-03-23 Thread Stefan Kempf
amap_extend is called when merging two adjacent areas of virtual address
space. However, merging is done only for kernel
virtual address space. It's not done for user space:

uvm/uvm_map.c:1359
/*
 * Try to merge entry.
 *
 * Userland allocations are kept separated most of the time.
 * Forego the effort of merging what most of the time can't be merged
 * and only try the merge if it concerns a kernel entry.
 */
if ((flags & UVM_FLAG_NOMERGE) == 0 &&
(map->flags & VM_MAP_ISVMSPACE) == 0)
uvm_mapent_tryjoin(map, entry, );


As far as I can tell, kernel vm_map_entries do not use amaps. So
amap_extend should never be called. Can we remove it?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.62
diff -u -p -r1.62 uvm_amap.c
--- uvm/uvm_amap.c  16 Mar 2016 16:53:43 -  1.62
+++ uvm/uvm_amap.c  23 Mar 2016 17:03:53 -
@@ -279,174 +279,6 @@ amap_free(struct vm_amap *amap)
 }
 
 /*
- * amap_extend: extend the size of an amap (if needed)
- *
- * => called from uvm_map when we want to extend an amap to cover
- *a new mapping (rather than allocate a new one)
- * => to safely extend an amap it should have a reference count of
- *one (thus it can't be shared)
- * => XXXCDC: support padding at this level?
- */
-int
-amap_extend(struct vm_map_entry *entry, vsize_t addsize)
-{
-   struct vm_amap *amap = entry->aref.ar_amap;
-   int slotoff = entry->aref.ar_pageoff;
-   int slotmapped, slotadd, slotneed, slotalloc;
-#ifdef UVM_AMAP_PPREF
-   int *newppref, *oldppref;
-#endif
-   u_int *newsl, *newbck, *oldsl, *oldbck;
-   struct vm_anon **newover, **oldover;
-   int slotadded;
-
-   /*
-* first, determine how many slots we need in the amap.  don't
-* forget that ar_pageoff could be non-zero: this means that
-* there are some unused slots before us in the amap.
-*/
-   AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
-   AMAP_B2SLOT(slotadd, addsize);  /* slots to add */
-   slotneed = slotoff + slotmapped + slotadd;
-
-   /*
-* case 1: we already have enough slots in the map and thus
-* only need to bump the reference counts on the slots we are
-* adding.
-*/
-   if (amap->am_nslot >= slotneed) {
-#ifdef UVM_AMAP_PPREF
-   if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
-   amap_pp_adjref(amap, slotoff + slotmapped, slotadd, 1);
-   }
-#endif
-   return (0);
-   }
-
-   /*
-* case 2: we pre-allocated slots for use and we just need to
-* bump nslot up to take account for these slots.
-*/
-   if (amap->am_maxslot >= slotneed) {
-#ifdef UVM_AMAP_PPREF
-   if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
-   if ((slotoff + slotmapped) < amap->am_nslot)
-   amap_pp_adjref(amap, slotoff + slotmapped, 
-   (amap->am_nslot - (slotoff + slotmapped)),
-   1);
-   pp_setreflen(amap->am_ppref, amap->am_nslot, 1, 
-  slotneed - amap->am_nslot);
-   }
-#endif
-   amap->am_nslot = slotneed;
-   /*
-* no need to zero am_anon since that was done at
-* alloc time and we never shrink an allocation.
-*/
-   return (0);
-   }
-
-   /*
-* case 3: we need to malloc a new amap and copy all the amap
-* data over from old amap to the new one.
-*
-* XXXCDC: could we take advantage of a kernel realloc()?  
-*/
-   if (slotneed >= UVM_AMAP_LARGE)
-   return E2BIG;
-
-   if (slotneed > UVM_AMAP_CHUNK)
-   slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
-   else
-   slotalloc = slotneed;
-
-#ifdef UVM_AMAP_PPREF
-   newppref = NULL;
-   if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
-   newppref = mallocarray(slotalloc, sizeof(int), M_UVMAMAP,
-   M_WAITOK | M_CANFAIL);
-   if (newppref == NULL) {
-   /* give up if malloc fails */
-   free(amap->am_ppref, M_UVMAMAP, 0);
-   amap->am_ppref = PPREF_NONE;
-   }
-   }
-#endif
-   if (slotneed > UVM_AMAP_CHUNK)
-   newsl = malloc(slotalloc * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   M_WAITOK | M_CANFAIL);
-   else
-   newsl = pool_get(_amap_slot_pools[slotalloc - 1],
-   PR_WAITOK | PR_LIMITFAIL);
-   if (newsl == NULL) {
-#ifdef UVM_AMAP_PPREF
-

Re: uvm: shrinking amap kmem consumption

2016-03-22 Thread Stefan Kempf
Mark Kettenis wrote:
> > Date: Mon, 21 Mar 2016 20:02:28 +0100
> > From: Stefan Kempf <sisnk...@gmail.com>
> > 
> > Recently we found that amaps consume a good deal of kernel address space.
See this thread: https://marc.info/?l=openbsd-tech&m=145752756005014&w=2.
> > And we found a way to reduce kernel mem pressure for some architectures
> > at least. See the diffs in that thread.
> > 
> > Besides that, it's possible to shrink the amap struct from 72 to 48 bytes
> > on 64 bit systems (or from 44 to 32 bytes on 32 bit systems).
> > 
> > It's also possible to cut down the memory needed for slots roughly in half
> > on 64 bit architectures, and cut it up to a factor of 3 on 32 bit machines.
> > 
> > Here's how amap slots are maintained currently: for every slot, the kernel
> > allocates one pointer to a vm_anon and two ints (16 bytes per slot on
> > 64 bit systems, 12 bytes for 32-bit CPUs).
> > 
> > To reduce these memory requirements, we need three flavors of amaps:
> > 
> > - Tiny amaps with only one slot store the pointer to the vm_anon in the
> >   amap directly. The two ints are not needed. This was Theo's idea.
> > 
> > - Small amaps with up to 32 slots need 8 instead of 16 bytes per slot
> >   (or 4 bytes instead of 12 on 32 bit machines).
> >   It's enough to store the array of anons. The two ints per slot are
> >   not needed.
> > 
> >   Tiny and small amaps are the ones used most often.
> > 
> > - Normal amaps with n >= 32 slots only need
> >   4 * sizeof(pointer) + n * sizeof(struct vm_anon *) + 12*n/32 bytes
> >   to maintain amap slots. For large n that's also around 1.8 times
> >   less memory for slots (or about 2.7 times less on 32 bit CPUs) compared
> >   to the current implementation.
> >   That memory is for the vm_anon array, and a header structure. The two
> >   ints per slot in the current code are replaced by n/32 bitmaps.
> > 
> [...]
> 
> Is it really worth having three flavours?  If having only two
> (tiny+normal?) simplifies the code considerable and doesn't result in
> much more memory being used, that may be preferable.

I think it's possible to simplify the current diff a little more.
If it's still considered too complex then doing small and normal
only would be an option.

I'll experiment some more.
 
> The amaps are one of the roadblocks on the way to make uvm more
> mpsafe.  And keeping the code simple will make that easier.
> 
> > Index: uvm/uvm_amap.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
> > retrieving revision 1.62
> > diff -u -p -r1.62 uvm_amap.c
> > --- uvm/uvm_amap.c  16 Mar 2016 16:53:43 -  1.62
> > +++ uvm/uvm_amap.c  21 Mar 2016 18:53:45 -
> > @@ -53,21 +53,31 @@
> >  struct pool uvm_amap_pool;
> >  
> >  /* Pools for amap slots for the most common amap slot sizes */
> > -struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
> > +struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK - 1];
> >  
> >  LIST_HEAD(, vm_amap) amap_list;
> >  
> > -static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
> > -
> > -#define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
> > +static char amap_slot_pool_names[UVM_AMAP_CHUNK - 1][13];
> >  
> >  /*
> >   * local functions
> >   */
> >  
> > -static struct vm_amap *amap_alloc1(int, int, int);
> > +struct vm_anon **amap_slots_alloc(struct vm_amap *, u_int, int,
> > +struct vm_amap_meta **);
> > +static struct vm_amap *amap_alloc1(u_int, u_int, int);
> >  static __inline void amap_list_insert(struct vm_amap *);
> >  static __inline void amap_list_remove(struct vm_amap *);   
> > +static __inline void amap_anon_release(struct vm_anon *);
> > +void amap_fill_slot(struct vm_amap *, u_int);
> > +void amap_normal_clear_slot(struct vm_amap *, u_int);
> > +void amap_clear_slot(struct vm_amap *, u_int);
> > +static __inline void amap_normal_wipe_slot(struct vm_amap *, u_int);
> > +
> > +void amap_wipeout_traverse(struct vm_anon **, u_int, u_int);
> > +int amap_cow_now_traverse(struct vm_anon **, u_int, u_int);
> > +int amap_swap_off_traverse(struct vm_amap *, struct vm_anon **, u_int, 
> > u_int,
> > +int, int);
> >  
> >  static __inline void
> >  amap_list_insert(struct vm_amap *amap)
> > @@ -81,6 +91,99 @@ amap_list_remove(struct vm_amap *amap)
> > LIST_REMOVE(amap, am_list);
> >  }
> >  
> > +static __inline void
> > +amap_anon_release(struct vm_anon *anon)
> > +{
> &

uvm: shrinking amap kmem consumption

2016-03-21 Thread Stefan Kempf
Recently we found that amaps consume a good deal of kernel address space.
See this thread: https://marc.info/?l=openbsd-tech&m=145752756005014&w=2.
And we found a way to reduce kernel mem pressure for some architectures
at least. See the diffs in that thread.

Besides that, it's possible to shrink the amap struct from 72 to 48 bytes
on 64 bit systems (or from 44 to 32 bytes on 32 bit systems).

It's also possible to cut down the memory needed for slots roughly in half
on 64 bit architectures, and cut it up to a factor of 3 on 32 bit machines.

Here's how amap slots are maintained currently: for every slot, the kernel
allocates one pointer to a vm_anon and two ints (16 bytes per slot on
64 bit systems, 12 bytes for 32-bit CPUs).

To reduce these memory requirements, we need three flavors of amaps:

- Tiny amaps with only one slot store the pointer to the vm_anon in the
  amap directly. The two ints are not needed. This was Theo's idea.

- Small amaps with up to 32 slots need 8 instead of 16 bytes per slot
  (or 4 bytes instead of 12 on 32 bit machines).
  It's enough to store the array of anons. The two ints per slot are
  not needed.

  Tiny and small amaps are the ones used most often.

- Normal amaps with n >= 32 slots only need
  4 * sizeof(pointer) + n * sizeof(struct vm_anon *) + 12*n/32 bytes
  to maintain amap slots. For large n that's also around 1.8 times
  less memory for slots (or about 2.7 times less on 32 bit CPUs) compared
  to the current implementation.
  That memory is for the vm_anon array, and a header structure. The two
  ints per slot in the current code are replaced by n/32 bitmaps.

But that also means that the amap layer has to do some special-casing
when dealing with amaps. I tried to factor out the common code where
possible.

How does it work? Basically, the *current* slots in an amap consist
of three arrays (for n slots, every array has size n):
- am_anon: an array of vm_anon
- am_slots: an array of indices that store which entries in am_anon are
  not NULL. This is for quickly traversing all occupied entries in
  the amap
- am_bckptr: maps a non-NULL entry in am_anon to an index in am_slots
  Needed to update am_slots when setting an entry in am_anon to NULL.

We can compress am_slots and am_bckptr to bitmaps of 32 bits each,
so we only need n/32 entries for am_slots and am_bckptr.

For amaps with up to 32 slots, we do not need am_slots and am_bckptr:
it's possible to store the bitmap of used slots in the amap directly.

I see no difference in doing a make -j4 build with this on an amd64
T430s (so things don't seem to get slower here with this). Tests on
other systems and architectures are much appreciated.

The diff below contains a proof of concept, but I'll also post smaller
diffs that are easier to review in the next days.

Comments/thoughts about doing this?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.62
diff -u -p -r1.62 uvm_amap.c
--- uvm/uvm_amap.c  16 Mar 2016 16:53:43 -  1.62
+++ uvm/uvm_amap.c  21 Mar 2016 18:53:45 -
@@ -53,21 +53,31 @@
 struct pool uvm_amap_pool;
 
 /* Pools for amap slots for the most common amap slot sizes */
-struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
+struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK - 1];
 
 LIST_HEAD(, vm_amap) amap_list;
 
-static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
-
-#define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
+static char amap_slot_pool_names[UVM_AMAP_CHUNK - 1][13];
 
 /*
  * local functions
  */
 
-static struct vm_amap *amap_alloc1(int, int, int);
+struct vm_anon **amap_slots_alloc(struct vm_amap *, u_int, int,
+struct vm_amap_meta **);
+static struct vm_amap *amap_alloc1(u_int, u_int, int);
 static __inline void amap_list_insert(struct vm_amap *);
 static __inline void amap_list_remove(struct vm_amap *);   
+static __inline void amap_anon_release(struct vm_anon *);
+void amap_fill_slot(struct vm_amap *, u_int);
+void amap_normal_clear_slot(struct vm_amap *, u_int);
+void amap_clear_slot(struct vm_amap *, u_int);
+static __inline void amap_normal_wipe_slot(struct vm_amap *, u_int);
+
+void amap_wipeout_traverse(struct vm_anon **, u_int, u_int);
+int amap_cow_now_traverse(struct vm_anon **, u_int, u_int);
+int amap_swap_off_traverse(struct vm_amap *, struct vm_anon **, u_int, u_int,
+int, int);
 
 static __inline void
 amap_list_insert(struct vm_amap *amap)
@@ -81,6 +91,99 @@ amap_list_remove(struct vm_amap *amap)
LIST_REMOVE(amap, am_list);
 }
 
+static __inline void
+amap_anon_release(struct vm_anon *anon)
+{
+   int refs;
+
+   refs = --anon->an_ref;
+   if (refs == 0) {
+   /* we had the last reference to a vm_anon. free it. */
+   uvm_anfree(anon);
+   }
+}
+
+void
+amap_fill_slot(struct vm_amap *amap, u_int slot)
+{
+   u_int clust, ptr;
+   struct vm_amap_meta *meta;
+   struct 

uvm amap: Simplify amap traversal in amap_swap_off

2016-03-19 Thread Stefan Kempf
The recent uvm commits fixed hangs that occurred because machines ran out of
memory due to using too much space for amap slots.

It's possible to shrink memory requirements for amaps even more,
but the current code needs some simplifications first. So here's another
cleanup diff. There'll be one or two more before we get to the real stuff.

This one simplifies the traversal of an amap when swap is disabled.

There's no need to insert marker elements to find the next item in the
amap list. The next amap can be determined by looking at the
currently examined amap.

Care must be taken to get the next element before the current amap is
possibly deleted, and after all the current amap's pages were read in
from swap (because the page-in may sleep and remove items from the amap
list).

ok?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.62
diff -u -p -r1.62 uvm_amap.c
--- uvm/uvm_amap.c  16 Mar 2016 16:53:43 -  1.62
+++ uvm/uvm_amap.c  17 Mar 2016 18:08:29 -
@@ -902,21 +902,11 @@ amap_swap_off(int startslot, int endslot
 {
struct vm_amap *am;
struct vm_amap *am_next;
-   struct vm_amap marker_prev;
-   struct vm_amap marker_next;
boolean_t rv = FALSE;
 
-#if defined(DIAGNOSTIC)
-   memset(&marker_prev, 0, sizeof(marker_prev));
-   memset(&marker_next, 0, sizeof(marker_next));
-#endif /* defined(DIAGNOSTIC) */
-
	for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
		int i;
 
-		LIST_INSERT_BEFORE(am, &marker_prev, am_list);
-		LIST_INSERT_AFTER(am, &marker_next, am_list);
-
for (i = 0; i < am->am_nused; i++) {
int slot;
int swslot;
@@ -935,23 +925,14 @@ amap_swap_off(int startslot, int endslot
rv = uvm_anon_pagein(anon);
 
am->am_flags &= ~AMAP_SWAPOFF;
-   if (amap_refs(am) == 0) {
-   amap_wipeout(am);
-   am = NULL;
+   if (rv || amap_refs(am) == 0)
break;
-   }
-   if (rv) {
-   break;
-   }
i = 0;
}
 
-	KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next ||
-	    LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) ==
-	    &marker_next);
-	am_next = LIST_NEXT(&marker_next, am_list);
-	LIST_REMOVE(&marker_prev, am_list);
-	LIST_REMOVE(&marker_next, am_list);
+   am_next = LIST_NEXT(am, am_list);
+   if (amap_refs(am) == 0)
+   amap_wipeout(am);
}
 
return rv;



uvm amap: Remove redundant check

2016-03-19 Thread Stefan Kempf
The compiler is also smart enough to recognize that this is redundant.
The resulting code on amd64 is basically equivalent (slightly different
register allocation and instruction scheduling).

ok?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.61
diff -u -p -r1.61 uvm_amap.c
--- uvm/uvm_amap.c  15 Mar 2016 18:16:21 -  1.61
+++ uvm/uvm_amap.c  16 Mar 2016 16:33:52 -
@@ -917,10 +917,6 @@ amap_swap_off(int startslot, int endslot
	LIST_INSERT_BEFORE(am, &marker_prev, am_list);
	LIST_INSERT_AFTER(am, &marker_next, am_list);
 
-   if (am->am_nused <= 0) {
-   goto next;
-   }
-
for (i = 0; i < am->am_nused; i++) {
int slot;
int swslot;
@@ -950,7 +946,6 @@ amap_swap_off(int startslot, int endslot
i = 0;
}
 
-next:
	KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next ||
	    LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) ==
	    &marker_next);



Re: hang with processes in fltamap: how can I identify running out of RAM?

2016-03-15 Thread Stefan Kempf
Tobias Ulmer wrote:
> Just wanted to note this diff in combination with your other uvm diff
> does really well on sparc, building ports. Cuts down amap "INUSE" by
> about a factor of 20.
> Will report if anything bad happens.

Cool, thanks for testing as well!

So I plan to commit both diffs if the ports build succeeds and if
there are no objections otherwise.



reducing kmem pressure: make sbrk() allocate amap slots lazily

2016-03-12 Thread Stefan Kempf
When sbrk() allocates a range of virtual memory, it immediately allocates
a vm_amap and an am_slots array inside the amap. There's one slot per page
allocated, and a slot is 16 bytes in size (on 64 bit CPUs, 12 on 32 bit
CPUs).

Preallocating slots makes sense mostly when we know that the memory range
is going to be populated by physical pages.

However, when doing sbrk(), physical pages are only allocated when
actually accessing the range allocated. Why should we pre-allocate
amap slots?

Removing the UVM_FLAG_OVERLAY flag turns this into lazy slot allocation
in chunks of 16 slots.

The diff below reduces kmem pressure (and hangs because of it), especially
in combination with the amap pool diff here:
https://marc.info/?l=openbsd-tech=145763420910948=2

Tests that make sure that things don't break with this are welcome.

Index: uvm/uvm_unix.c
===
RCS file: /cvs/src/sys/uvm/uvm_unix.c,v
retrieving revision 1.56
diff -u -p -r1.56 uvm_unix.c
--- uvm/uvm_unix.c  5 May 2015 02:13:46 -   1.56
+++ uvm/uvm_unix.c  12 Mar 2016 10:24:24 -
@@ -87,7 +87,7 @@ sys_obreak(struct proc *p, void *v, regi
UVM_MAPFLAG(PROT_READ | PROT_WRITE,
PROT_READ | PROT_WRITE | PROT_EXEC, MAP_INHERIT_COPY,
MADV_NORMAL, UVM_FLAG_AMAPPAD|UVM_FLAG_FIXED|
-   UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW));
+   UVM_FLAG_COPYONW));
if (error) {
uprintf("sbrk: grow %ld failed, error = %d\n",
new - old, error);



Re: hang with processes in fltamap: how can I identify running out of RAM?

2016-03-12 Thread Stefan Kempf
Stuart Henderson wrote:
> On 2016/03/10 19:18, Stefan Kempf wrote:
> > There's still at least one issue with the diff. Again in amap_extend().
> > The slotalloc computation was still off :-(
> 
> It's not perfect but this is very significantly better. I've put
> it under load and the machine is still ok.

Cool. I have an idea why processes still become unresponsive. See
the comments on the vmstat outputs.
 
> Here's a snapshot of a top display:
> 
> 46 processes: 1 running, 43 idle, 2 on processor  up  6:27
> CPU0 states:  0.0% user,  0.0% nice, 69.2% system,  0.0% interrupt, 30.8% idle
> CPU1 states:  0.0% user,  0.0% nice, 92.3% system,  0.0% interrupt,  7.7% idle
> Memory: Real: 3211M/5422M act/tot Free: 6540M Cache: 153M Swap: 0K/2640M
> 
>   PID USERNAME PRI NICE  SIZE   RES STATE WAIT  TIMECPU COMMAND
> 61052 sthen-180 2472M 1505M onproc-63:51 54.39% perl
> 86140 sthen-180 2308M 1732M run   -57:39 52.54% perl
> 47388 sthen 280 1284K 2480K onproc- 0:45  1.03% top
> 59819 _pflogd40  684K  260K sleep bpf   0:15  0.73% pflogd
> 89941 sthen  20 3552K 1780K sleep select0:13  0.15% sshd
> 46513 _ntp   2  -20 1240K 1456K sleep poll  0:08  0.05% ntpd
> 89307 root   2  -20  812K 1596K idle  poll  3:44  0.00% ntpd
> 78543 sthen  20 2536K 1784K sleep poll  0:10  0.00% systat
> 21968 sthen  20 3568K 1772K idle  select0:08  0.00% sshd
> 10538 sthen  20 9220K 8400K sleep kqread0:05  0.00% tmux
> 45455 sthen  20 3564K 1716K sleep select0:01  0.00% sshd
> 1 root  100  440K4K idle  wait  0:01  0.00% init
> 74816 sthen  20 3564K 1736K sleep select0:01  0.00% sshd
> 39769 _syslogd   20 1036K 1144K idle  kqread0:00  0.00% syslogd
> 
> At this point the big perl processes were flipping between onproc/run
> and sleep with wchan "fltamap".
> 
> The not-so-good bit: I couldn't kill them by signalling with ^C ^\ or
> kill(1), however I was able to get them to stop by killing the tmux they
> were running in. But without the diff this would have been a machine
> hang for sure.
> 
> Below I include vmstat -m output, firstly from around the point of
> the top(1) run above, secondly after the processes had exited.
> 
> Given the area more eyes are definitely wanted but it reads well to
> me, and I'm happy from my testing that this is a huge improvement,
> so: OK sthen.

Thanks. Agreed, any additional review and testing is useful,
also to ensure it doesn't break stuff.

> I have one nit, "systat pool" normally shows only 11 chars so
> you get "amapslotpl1, amapslotpl1, amapslotpl1, ... amapslotpl2" etc.
> I considered renaming to amapslpl1, 2, .. but that's pretty ugly,
> and since vmstat -m displays 12 chars, I propose this:

I think your diff is ok.
 
> Index: pool.c
> ===
> RCS file: /cvs/src/usr.bin/systat/pool.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 pool.c
> --- pool.c16 Jan 2015 00:03:37 -  1.10
> +++ pool.c11 Mar 2016 21:09:35 -
> @@ -51,7 +51,7 @@ struct pool_info *pools = NULL;
>  
>  
>  field_def fields_pool[] = {
> - {"NAME", 11, 32, 1, FLD_ALIGN_LEFT, -1, 0, 0, 0},
> + {"NAME", 12, 32, 1, FLD_ALIGN_LEFT, -1, 0, 0, 0},
>   {"SIZE", 8, 24, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
>   {"REQUESTS", 8, 24, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
>   {"FAIL", 8, 24, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
> 
> 
> |... during run
> 
> <sthen@stamp1:~:295>$ vmstat -m
> Memory statistics by bucket size
> Size   In Use   Free   Requests  HighWater  Couldfree
>   16 2498318   88821280 12
>   32 1038114   4266 640  0
>   6452460 20 114795 320   4544
>  128   129653 43 508458 160  68133
>  256  132 28   3161  80  0
>  512   120414  2 813289  40  40368
> 1024  307 25  60779  20  23820
> 2048  232 58   1439  10284
> 4096  122  5 110326   5  45531
> 8192  206  0242   5  0
>163844  0  5   5  0
>327689  0 10   5  0
>655363  0511   5  0
>   1310722  0  

Re: hang with processes in fltamap: how can I identify running out of RAM?

2016-03-10 Thread Stefan Kempf
Stefan Kempf wrote:
> Stuart Henderson wrote:
> > On 2016/03/09 20:49, Stefan Kempf wrote:
> > > Here's a diff that allocates the most commonly used amap slot sizes
> > > (between 1 and 16) using pool_get(9) instead of malloc(9). That should
> > > reduce the pressure on kernel virtual address space somewhat (on amd64
> > > at least),
> > 
> > Thanks for the useful information and diff.
> > 
> > I'm running a full ports build on i386 now with this in the kernel,
> > and will run some more of my memory-hungry jobs on amd64 tomorrow.
>  
> I looked at the diff again and unfortunately there is a bug in the
> first version. slotalloc was computed incorrectly in amap_extend()
> when using pool(9).
> 
> That code path seems to not be executed often though
> and can only be triggered with certain allocation sizes (maybe the
> i386 ports build survives even with the first diff).
> 
> But please try the one below on amd64 if possible.

There's still at least one issue with the diff. Again in amap_extend().
The slotalloc computation was still off :-(

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.60
diff -u -p -r1.60 uvm_amap.c
--- uvm/uvm_amap.c  6 Mar 2016 14:47:07 -   1.60
+++ uvm/uvm_amap.c  10 Mar 2016 18:15:17 -
@@ -52,8 +52,13 @@
 
 struct pool uvm_amap_pool;
 
+/* Pools for amap slots for the most common amap slot sizes */
+struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
+
 LIST_HEAD(, vm_amap) amap_list;
 
+static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
+
 #define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
 
 /*
@@ -151,10 +156,20 @@ pp_setreflen(int *ppref, int offset, int
 void
 amap_init(void)
 {
+   int i;
+
	/* Initialize the vm_amap pool. */
	pool_init(&uvm_amap_pool, sizeof(struct vm_amap), 0, 0, PR_WAITOK,
	    "amappl", NULL);
	pool_sethiwat(&uvm_amap_pool, 4096);
+
+	for (i = 0; i < nitems(uvm_amap_slot_pools); i++) {
+		snprintf(amap_slot_pool_names[i],
+		    sizeof(amap_slot_pool_names[0]), "amapslotpl%d", i + 1);
+		pool_init(&uvm_amap_slot_pools[i], (i + 1) * MALLOC_SLOT_UNIT,
+		    0, 0, PR_WAITOK, amap_slot_pool_names[i], NULL);
+		pool_sethiwat(&uvm_amap_slot_pools[i], 4096);
+	}
 }
 
 /*
@@ -172,8 +187,13 @@ amap_alloc1(int slots, int padslots, int
if (amap == NULL)
return(NULL);
 
-   totalslots = malloc_roundup((slots + padslots) * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   totalslots = slots + padslots;
+   KASSERT(totalslots > 0);
+
+   if (totalslots > UVM_AMAP_CHUNK)
+   totalslots = malloc_roundup(totalslots * MALLOC_SLOT_UNIT) /
+   MALLOC_SLOT_UNIT;
+
amap->am_ref = 1;
amap->am_flags = 0;
 #ifdef UVM_AMAP_PPREF
@@ -183,8 +203,14 @@ amap_alloc1(int slots, int padslots, int
amap->am_nslot = slots;
amap->am_nused = 0;
 
-   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   waitf);
+   if (totalslots > UVM_AMAP_CHUNK)
+   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT,
+   M_UVMAMAP, waitf);
+	else
+		amap->am_slots = pool_get(
+		    &uvm_amap_slot_pools[totalslots - 1],
+		    (waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT);
+
if (amap->am_slots == NULL)
goto fail1;
 
@@ -238,7 +264,12 @@ amap_free(struct vm_amap *amap)
KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
 
-   free(amap->am_slots, M_UVMAMAP, 0);
+   if (amap->am_maxslot > UVM_AMAP_CHUNK)
+   free(amap->am_slots, M_UVMAMAP, 0);
+	else
+		pool_put(&uvm_amap_slot_pools[amap->am_maxslot - 1],
+		    amap->am_slots);
+
 #ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
free(amap->am_ppref, M_UVMAMAP, 0);
@@ -324,8 +355,12 @@ amap_extend(struct vm_map_entry *entry, 
if (slotneed >= UVM_AMAP_LARGE)
return E2BIG;
 
-   slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   if (slotneed > UVM_AMAP_CHUNK)
+   slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
+   MALLOC_SLOT_UNIT;
+   else
+   slotalloc = slotneed;
+
 #ifdef UVM_AMAP_PPREF
newppref = NULL;
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
@@ -338,8 +373,12 @@ amap_extend(struct vm_map_entry *entry, 
}
}
 #endif
-   newsl

Re: hang with processes in fltamap: how can I identify running out of RAM?

2016-03-09 Thread Stefan Kempf
Stuart Henderson wrote:
> On 2016/03/09 20:49, Stefan Kempf wrote:
> > Here's a diff that allocates the most commonly used amap slot sizes
> > (between 1 and 16) using pool_get(9) instead of malloc(9). That should
> > reduce the pressure on kernel virtual address space somewhat (on amd64
> > at least),
> 
> Thanks for the useful information and diff.
> 
> I'm running a full ports build on i386 now with this in the kernel,
> and will run some more of my memory-hungry jobs on amd64 tomorrow.
 
I looked at the diff again and unfortunately there is a bug in the
first version. slotalloc was computed incorrectly in amap_extend()
when using pool(9).

That code path seems to not be executed often though
and can only be triggered with certain allocation sizes (maybe the
i386 ports build survives even with the first diff).

But please try the one below on amd64 if possible.

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.60
diff -u -p -r1.60 uvm_amap.c
--- uvm/uvm_amap.c  6 Mar 2016 14:47:07 -   1.60
+++ uvm/uvm_amap.c  10 Mar 2016 05:19:09 -
@@ -52,8 +52,13 @@
 
 struct pool uvm_amap_pool;
 
+/* Pools for amap slots for the most common amap slot sizes */
+struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
+
 LIST_HEAD(, vm_amap) amap_list;
 
+static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
+
 #define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
 
 /*
@@ -151,10 +156,20 @@ pp_setreflen(int *ppref, int offset, int
 void
 amap_init(void)
 {
+   int i;
+
	/* Initialize the vm_amap pool. */
	pool_init(&uvm_amap_pool, sizeof(struct vm_amap), 0, 0, PR_WAITOK,
	    "amappl", NULL);
	pool_sethiwat(&uvm_amap_pool, 4096);
+
+	for (i = 0; i < nitems(uvm_amap_slot_pools); i++) {
+		snprintf(amap_slot_pool_names[i],
+		    sizeof(amap_slot_pool_names[0]), "amapslotpl%d", i + 1);
+		pool_init(&uvm_amap_slot_pools[i], (i + 1) * MALLOC_SLOT_UNIT,
+		    0, 0, PR_WAITOK, amap_slot_pool_names[i], NULL);
+		pool_sethiwat(&uvm_amap_slot_pools[i], 4096);
+	}
 }
 
 /*
@@ -172,8 +187,13 @@ amap_alloc1(int slots, int padslots, int
if (amap == NULL)
return(NULL);
 
-   totalslots = malloc_roundup((slots + padslots) * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   totalslots = slots + padslots;
+   KASSERT(totalslots > 0);
+
+   if (totalslots > UVM_AMAP_CHUNK)
+   totalslots = malloc_roundup(
+   (slots + padslots) * MALLOC_SLOT_UNIT) / MALLOC_SLOT_UNIT;
+
amap->am_ref = 1;
amap->am_flags = 0;
 #ifdef UVM_AMAP_PPREF
@@ -183,8 +203,14 @@ amap_alloc1(int slots, int padslots, int
amap->am_nslot = slots;
amap->am_nused = 0;
 
-   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   waitf);
+   if (totalslots > UVM_AMAP_CHUNK)
+   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT,
+   M_UVMAMAP, waitf);
+	else
+		amap->am_slots = pool_get(
+		    &uvm_amap_slot_pools[totalslots - 1],
+		    (waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT);
+
if (amap->am_slots == NULL)
goto fail1;
 
@@ -238,7 +264,12 @@ amap_free(struct vm_amap *amap)
KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
 
-   free(amap->am_slots, M_UVMAMAP, 0);
+   if (amap->am_maxslot > UVM_AMAP_CHUNK)
+   free(amap->am_slots, M_UVMAMAP, 0);
+	else
+		pool_put(&uvm_amap_slot_pools[amap->am_maxslot - 1],
+		    amap->am_slots);
+
 #ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
free(amap->am_ppref, M_UVMAMAP, 0);
@@ -324,8 +355,7 @@ amap_extend(struct vm_map_entry *entry, 
if (slotneed >= UVM_AMAP_LARGE)
return E2BIG;
 
-   slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   slotalloc = slotneed;
 #ifdef UVM_AMAP_PPREF
newppref = NULL;
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
@@ -338,8 +368,14 @@ amap_extend(struct vm_map_entry *entry, 
}
}
 #endif
-   newsl = malloc(slotalloc * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   M_WAITOK | M_CANFAIL);
+   if (slotneed > UVM_AMAP_CHUNK) {
+   slotalloc = malloc_roundup(slotneed * MALLOC_SLOT_UNIT) /
+   MALLOC_SLOT_UNIT;
+   newsl = malloc(slotalloc * MALLOC_SLOT_UNIT, M_UVMAMAP,
+   M_WAITOK | M_CANFAIL);
+   } else
+   newsl = pool_get(_amap_s

Re: hang with processes in fltamap: how can I identify running out of RAM?

2016-03-09 Thread Stefan Kempf
Stuart Henderson wrote:
> While running some fairly memory-hungry jobs I hit a state where wchan
> in top was showing "fltamap" and the machine locked up (couldn't enter
> DDB).
> 
> Which must be this:
> 
>   /* didn't work?  must be out of RAM.  sleep. */
>   if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
>   uvmfault_unlockmaps(ufi, TRUE);
>   uvm_wait("fltamapcopy");
>   continue;
>   }

Correct. You still have plenty of RAM free, but the kernel cannot
allocate an amap and amap slots to maintain it because it runs
out of kernel virtual address space.

Here's a diff that allocates the most commonly used amap slot sizes
(between 1 and 16) using pool_get(9) instead of malloc(9). That should
reduce the pressure on kernel virtual address space somewhat (on amd64
at least),

As discussed with Theo, there's more optimization potential
for amaps with up to 4 slots: the slots can be stored in the amap
itself. This is still brewing though, and will later be added on top
of the diff below.

First testing and review is still appreciated to know we have the
basics right before doing more optimizations.

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.60
diff -u -p -r1.60 uvm_amap.c
--- uvm/uvm_amap.c  6 Mar 2016 14:47:07 -   1.60
+++ uvm/uvm_amap.c  9 Mar 2016 19:37:55 -
@@ -52,8 +52,13 @@
 
 struct pool uvm_amap_pool;
 
+/* Pools for amap slots for the most common amap slot sizes */
+struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK];
+
 LIST_HEAD(, vm_amap) amap_list;
 
+static char amap_slot_pool_names[UVM_AMAP_CHUNK][13];
+
 #define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *))
 
 /*
@@ -151,10 +156,20 @@ pp_setreflen(int *ppref, int offset, int
 void
 amap_init(void)
 {
+   int i;
+
	/* Initialize the vm_amap pool. */
	pool_init(&uvm_amap_pool, sizeof(struct vm_amap), 0, 0, PR_WAITOK,
	    "amappl", NULL);
	pool_sethiwat(&uvm_amap_pool, 4096);
+
+	for (i = 0; i < nitems(uvm_amap_slot_pools); i++) {
+		snprintf(amap_slot_pool_names[i],
+		    sizeof(amap_slot_pool_names[0]), "amapslotpl%d", i + 1);
+		pool_init(&uvm_amap_slot_pools[i], (i + 1) * MALLOC_SLOT_UNIT,
+		    0, 0, PR_WAITOK, amap_slot_pool_names[i], NULL);
+		pool_sethiwat(&uvm_amap_slot_pools[i], 4096);
+	}
 }
 
 /*
@@ -172,8 +187,13 @@ amap_alloc1(int slots, int padslots, int
if (amap == NULL)
return(NULL);
 
-   totalslots = malloc_roundup((slots + padslots) * MALLOC_SLOT_UNIT) /
-   MALLOC_SLOT_UNIT;
+   totalslots = slots + padslots;
+   KASSERT(totalslots > 0);
+
+   if (totalslots > UVM_AMAP_CHUNK)
+   totalslots = malloc_roundup(
+   (slots + padslots) * MALLOC_SLOT_UNIT) / MALLOC_SLOT_UNIT;
+
amap->am_ref = 1;
amap->am_flags = 0;
 #ifdef UVM_AMAP_PPREF
@@ -183,8 +203,14 @@ amap_alloc1(int slots, int padslots, int
amap->am_nslot = slots;
amap->am_nused = 0;
 
-   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   waitf);
+   if (totalslots > UVM_AMAP_CHUNK)
+   amap->am_slots = malloc(totalslots * MALLOC_SLOT_UNIT,
+   M_UVMAMAP, waitf);
+	else
+		amap->am_slots = pool_get(
+		    &uvm_amap_slot_pools[totalslots - 1],
+		    (waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT);
+
if (amap->am_slots == NULL)
goto fail1;
 
@@ -238,7 +264,12 @@ amap_free(struct vm_amap *amap)
KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
 
-   free(amap->am_slots, M_UVMAMAP, 0);
+   if (amap->am_maxslot > UVM_AMAP_CHUNK)
+   free(amap->am_slots, M_UVMAMAP, 0);
+	else
+		pool_put(&uvm_amap_slot_pools[amap->am_maxslot - 1],
+		    amap->am_slots);
+
 #ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
free(amap->am_ppref, M_UVMAMAP, 0);
@@ -338,8 +369,12 @@ amap_extend(struct vm_map_entry *entry, 
}
}
 #endif
-   newsl = malloc(slotalloc * MALLOC_SLOT_UNIT, M_UVMAMAP,
-   M_WAITOK | M_CANFAIL);
+   if (slotneed > UVM_AMAP_CHUNK)
+   newsl = malloc(slotalloc * MALLOC_SLOT_UNIT, M_UVMAMAP,
+   M_WAITOK | M_CANFAIL);
+	else
+		newsl = pool_get(&uvm_amap_slot_pools[slotneed - 1],
+		    PR_WAITOK | PR_LIMITFAIL);
if (newsl == NULL) {
 #ifdef UVM_AMAP_PPREF
if (newppref != NULL) {
@@ -389,12 +424,17 @@ amap_extend(struct vm_map_entry *entry, 
}
 #endif
 
-   /* update master values */
+   /* free */
+   if (amap->am_maxslot > 

Remove unused amap_share_protect

2016-03-05 Thread Stefan Kempf
See subject.

ok?

diff --git a/uvm/uvm_amap.c b/uvm/uvm_amap.c
index ef8e505..3cc7ed1 100644
--- a/uvm/uvm_amap.c
+++ b/uvm/uvm_amap.c
@@ -403,49 +403,6 @@ amap_extend(struct vm_map_entry *entry, vsize_t addsize)
 }
 
 /*
- * amap_share_protect: change protection of anons in a shared amap
- *
- * for shared amaps, given the current data structure layout, it is
- * not possible for us to directly locate all maps referencing the
- * shared anon (to change the protection).  in order to protect data
- * in shared maps we use pmap_page_protect().  [this is useful for IPC
- * mechanisms like map entry passing that may want to write-protect
- * all mappings of a shared amap.]  we traverse am_anon or am_slots
- * depending on the current state of the amap.
- */
-void
-amap_share_protect(struct vm_map_entry *entry, vm_prot_t prot)
-{
-   struct vm_amap *amap = entry->aref.ar_amap;
-   int slots, lcv, slot, stop;
-
-   AMAP_B2SLOT(slots, (entry->end - entry->start));
-   stop = entry->aref.ar_pageoff + slots;
-
-   if (slots < amap->am_nused) {
-   /* cheaper to traverse am_anon */
-   for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) {
-   if (amap->am_anon[lcv] == NULL)
-   continue;
-   if (amap->am_anon[lcv]->an_page != NULL)
-   pmap_page_protect(amap->am_anon[lcv]->an_page,
- prot);
-   }
-   return;
-   }
-
-   /* cheaper to traverse am_slots */
-   for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
-   slot = amap->am_slots[lcv];
-   if (slot < entry->aref.ar_pageoff || slot >= stop)
-   continue;
-   if (amap->am_anon[slot]->an_page != NULL)
-   pmap_page_protect(amap->am_anon[slot]->an_page, prot);
-   }
-   return;
-}
-
-/*
  * amap_wipeout: wipeout all anon's in an amap; then free the amap!
  *
  * => called from amap_unref when the final reference to an amap is
diff --git a/uvm/uvm_amap.h b/uvm/uvm_amap.h
index e10e735..adcc1da 100644
--- a/uvm/uvm_amap.h
+++ b/uvm/uvm_amap.h
@@ -88,8 +88,6 @@ void  amap_lookups(struct vm_aref *, vaddr_t, struct 
vm_anon **, int);
 void   amap_ref(struct vm_amap *, vaddr_t, vsize_t, int);
/* get number of references of amap */
 intamap_refs(struct vm_amap *);
-   /* protect pages in a shared amap */
-void   amap_share_protect(vm_map_entry_t, vm_prot_t);
/* split reference to amap into two */
 void   amap_splitref(struct vm_aref *, struct vm_aref *, vaddr_t);
/* remove an anon from an amap */



Re: remove VOP_UNLOCK() flags

2016-03-05 Thread Stefan Kempf
Martin Natano wrote:
> The VOP_UNLOCK() function has a flags parameter, which is documented to
> should be zero in most cases. It turns out that the flags argument is 
> zero for all ~200 callers in the tree. On a closer look it is revealed
> that VOP_UNLOCK() uses lockmgr() for all filesystems that support
> locking. The only lockmgr() flag that is useful for unlocking is
> LK_RELEASE, which is set by the implementation anyways, so passing flags
> to VOP_UNLOCK() doesn't make sense anyway.
> 
> Ok to remove the flags argument from VOP_UNLOCK()?

Makes sense to me. NetBSD does not have the flags argument for VOP_UNLOCK
either.
 
> natano
> 
> 
> Index: share/man/man9/VOP_LOOKUP.9
> ===
> RCS file: /cvs/src/share/man/man9/VOP_LOOKUP.9,v
> retrieving revision 1.32
> diff -u -p -r1.32 VOP_LOOKUP.9
> --- share/man/man9/VOP_LOOKUP.9   2 Dec 2015 11:03:40 -   1.32
> +++ share/man/man9/VOP_LOOKUP.9   2 Mar 2016 20:02:46 -
> @@ -283,7 +283,6 @@
>  .Ft int
>  .Fo VOP_UNLOCK
>  .Fa "struct vnode *vp"
> -.Fa "int flags"
>  .Fa "struct proc *p"
>  .Fc
>  .Ft int
> @@ -564,7 +563,7 @@ returned.
>  .Pp
>  .It Fn VOP_ISLOCKED vp
>  .It Fn VOP_LOCK vp flags p
> -.It Fn VOP_UNLOCK vp flags p
> +.It Fn VOP_UNLOCK vp p
>  .Fn VOP_LOCK
>  is used internally by
>  .Xr vn_lock 9
> @@ -572,8 +571,6 @@ to lock a vnode.
>  It should not be used by other file system code.
>  .Fn VOP_UNLOCK
>  unlocks a vnode.
> -.Fa flags
> -should be zero in most cases.
>  .Fn VOP_ISLOCKED
>  returns 1 if
>  .Fa vp
> Index: share/man/man9/vinvalbuf.9
> ===
> RCS file: /cvs/src/share/man/man9/vinvalbuf.9,v
> retrieving revision 1.9
> diff -u -p -r1.9 vinvalbuf.9
> --- share/man/man9/vinvalbuf.917 Jul 2013 20:21:56 -  1.9
> +++ share/man/man9/vinvalbuf.92 Mar 2016 20:02:46 -
> @@ -87,7 +87,7 @@ A value of 0 is returned on success.
>  .Bd -literal -offset indent
>  vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
>  error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0);
> -VOP_UNLOCK(devvp, 0, p);
> +VOP_UNLOCK(devvp, p);
>  if (error)
>   return (error);
>  .Ed
> Index: sys/arch/sparc64/dev/vdsp.c
> ===
> RCS file: /cvs/src/sys/arch/sparc64/dev/vdsp.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 vdsp.c
> --- sys/arch/sparc64/dev/vdsp.c   29 Dec 2015 04:46:28 -  1.41
> +++ sys/arch/sparc64/dev/vdsp.c   2 Mar 2016 20:02:49 -
> @@ -941,7 +941,7 @@ vdsp_open(void *arg1)
>   sc->sc_vdisk_size = va.va_size / DEV_BSIZE;
>   }
>  
> - VOP_UNLOCK(nd.ni_vp, 0, p);
> + VOP_UNLOCK(nd.ni_vp, p);
>   sc->sc_vp = nd.ni_vp;
>  
>   vdsp_readlabel(sc);
> @@ -1013,7 +1013,7 @@ vdsp_readlabel(struct vdsp_softc *sc)
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   err = VOP_READ(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>   if (err) {
>   free(sc->sc_label, M_DEVBUF, 0);
>   sc->sc_label = NULL;
> @@ -1043,7 +1043,7 @@ vdsp_writelabel(struct vdsp_softc *sc)
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   err = VOP_WRITE(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>  
>   return (err);
>  }
> @@ -1074,7 +1074,7 @@ vdsp_is_iso(struct vdsp_softc *sc)
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   err = VOP_READ(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>  
>   if (err == 0 && memcmp(vdp->id, ISO_STANDARD_ID, sizeof(vdp->id)))
>   err = ENOENT;
> @@ -1153,7 +1153,7 @@ vdsp_read_desc(struct vdsp_softc *sc, st
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   dm->status = VOP_READ(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>  
>   KERNEL_UNLOCK();
>   if (dm->status == 0) {
> @@ -1227,7 +1227,7 @@ vdsp_read_dring(void *arg1, void *arg2)
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   vd->status = VOP_READ(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>  
>   KERNEL_UNLOCK();
>   if (vd->status == 0) {
> @@ -1326,7 +1326,7 @@ vdsp_write_dring(void *arg1, void *arg2)
>  
>   vn_lock(sc->sc_vp, LK_EXCLUSIVE | LK_RETRY, p);
>   vd->status = VOP_WRITE(sc->sc_vp, &uio, 0, p->p_ucred);
> - VOP_UNLOCK(sc->sc_vp, 0, p);
> + VOP_UNLOCK(sc->sc_vp, p);
>  
>  fail:
>   free(buf, M_DEVBUF, 0);
> Index: sys/dev/diskmap.c
> ===
> RCS file: /cvs/src/sys/dev/diskmap.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 diskmap.c
> --- 

Re: read(2) shouldn't return EFBIG

2016-02-29 Thread Stefan Kempf
Martin Natano wrote:
> On Sun, Feb 28, 2016 at 04:40:21PM +0100, Stefan Kempf wrote:
> > Stefan Kempf wrote:
> > > Martin Pieuchot wrote:
> > > > I'm also wondering when you say "an offset that's at or paste the
> > > > EOF" does that include ``uio_resid''?  I mean shouldn't you check
> > > > for:
> > > > 
> > > > if ((uio->uio_offset + uio->uio_resid) > fs->fs_maxfilesize)
> > > > ...
> > > 
> > > I think if uio->uio_offset < fs->fs_maxfilesize, then
> > > min(uio->uio_resid, fs->fs_maxfilesize - uio->uio_offset)
> > > bytes should be read.
> > 
> > Ehh. Not quite true. It can't read more than the size of the file.
> 
> Yes, correct. uio_resid is not included, because instead of returning
> zero, the function should transfer the available bytes (even if less
> than has been requested).
> 
> 
> > The read-loops of ffs_read and ext2_ind_read already check
> > whether uio_offset is beyond the size of the file:
> > 
> > for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
> > if ((bytesinfile = DIP(ip, size) - uio->uio_offset) <= 0)
> > break;
> > 
> > If we are assured that DIP(ip, size) cannot return a bogus size
> > (> fs->fs_maxfilesize), the check before the loop could be
> > 
> > if (uio->uio_offset >= DIP(ip, size) || uio->uio_resid == 0)
> > return (0);
> >  
> > > Maybe the check should be if (uio_offset >= fs->fs_maxfilesize)
> > > though. Otherwise access flags of the inode are changed at the end
> > > if ext2_ind_read() and ffs_read() even though there was no real access.
> 
> Your comment got me investigate what the supposed behaviour is. It turns
> out POSIX is pretty clear about the intended behaviour: "Upon successful
> completion, where nbyte is greater than 0, read() will mark for update
> the st_atime field of the file, and return the number of bytes read."
> Note, that nbytes is the requested read size, not the actual read size,
> so even if the actual read is zero bytes, atime has to be updated when
> more has been requested.
> 
> I think what this means is, that we shouldn't check for (uio_offset >
> filesize) before the loop at all and let the respecitve read loop do
> it's magic. All the involved loops seem to handle this case correctly
> already.

Thanks for checking. Looks good.
 
> Does that make sense?
> 
> natano
> 
> 
> Index: ufs/ext2fs/ext2fs_readwrite.c
> ===
> RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_readwrite.c,v
> retrieving revision 1.39
> diff -u -p -r1.39 ext2fs_readwrite.c
> --- ufs/ext2fs/ext2fs_readwrite.c 27 Feb 2016 18:50:38 -  1.39
> +++ ufs/ext2fs/ext2fs_readwrite.c 28 Feb 2016 17:57:08 -
> @@ -100,8 +100,8 @@ ext2_ind_read(struct vnode *vp, struct i
>   } else if (vp->v_type != VREG && vp->v_type != VDIR)
>   panic("%s: type %d", "ext2fs_read", vp->v_type);
>  #endif
> - if (e2fs_overflow(fs, 0, uio->uio_offset))
> - return (EFBIG);
> + if (uio->uio_offset < 0)
> + return (EINVAL);
>   if (uio->uio_resid == 0)
>   return (0);
>  
> @@ -163,7 +163,6 @@ ext4_ext_read(struct vnode *vp, struct i
>   struct ext4_extent_path path;
>   struct ext4_extent nex, *ep;
>   struct buf *bp;
> - size_t orig_resid;
>   daddr_t lbn, pos;
>   off_t bytesinfile;
>   int size, xfersize, blkoffset;
> @@ -171,12 +170,10 @@ ext4_ext_read(struct vnode *vp, struct i
>  
>   memset(, 0, sizeof path);
>  
> - orig_resid = uio->uio_resid;
> - if (orig_resid == 0)
> + if (uio->uio_offset < 0)
> + return (EINVAL);
> + if (uio->uio_resid == 0)
>   return (0);
> -
> - if (e2fs_overflow(fs, 0, uio->uio_offset))
> - return (EFBIG);
>  
>   while (uio->uio_resid > 0) {
>   if ((bytesinfile = ext2fs_size(ip) - uio->uio_offset) <= 0)
> Index: ufs/ffs/ffs_vnops.c
> ===
> RCS file: /cvs/src/sys/ufs/ffs/ffs_vnops.c,v
> retrieving revision 1.84
> diff -u -p -r1.84 ffs_vnops.c
> --- ufs/ffs/ffs_vnops.c   27 Feb 2016 18:50:38 -  1.84
> +++ ufs/ffs/ffs_vnops.c   28 Feb 2016 17:57:08 -
> @@ -214,9 +214,8 @@ ffs_read(void *v)
>   panic("ffs_read: type %d", vp->

Switch to uiomove in usb

2016-02-28 Thread Stefan Kempf
This changes uiomovei calls to uiomove in usb. It fixes
a few integer truncations due to use of min, and uses
unsigned types for count variables where it makes sense.
This also allows us to get rid of a couple of 'if (len < 0)' checks
that just cannot happen.

usbd_get_cdesc() returns a length value by a pointer. This diff changes
the base type to u_int as well, as all callers pass in unsigned variables
now.

ok?

Index: dev/usb//ugen.c
===
RCS file: /cvs/src/sys/dev/usb/ugen.c,v
retrieving revision 1.91
diff -u -p -r1.91 ugen.c
--- dev/usb//ugen.c 19 Oct 2015 14:05:01 -  1.91
+++ dev/usb//ugen.c 28 Feb 2016 19:10:07 -
@@ -468,7 +468,8 @@ int
 ugen_do_read(struct ugen_softc *sc, int endpt, struct uio *uio, int flag)
 {
struct ugen_endpoint *sce = >sc_endpoints[endpt][IN];
-   u_int32_t n, tn;
+   u_int32_t tn;
+   size_t n;
char buf[UGEN_BBSIZE];
struct usbd_xfer *xfer;
usbd_status err;
@@ -523,16 +524,16 @@ ugen_do_read(struct ugen_softc *sc, int 
 
/* Transfer as many chunks as possible. */
while (sce->q.c_cc > 0 && uio->uio_resid > 0 && !error) {
-   n = min(sce->q.c_cc, uio->uio_resid);
+   n = ulmin(sce->q.c_cc, uio->uio_resid);
if (n > sizeof(buffer))
n = sizeof(buffer);
 
/* Remove a small chunk from the input queue. */
q_to_b(>q, buffer, n);
-   DPRINTFN(5, ("ugenread: got %d chars\n", n));
+   DPRINTFN(5, ("ugenread: got %zu chars\n", n));
 
/* Copy the data to the user process. */
-   error = uiomovei(buffer, n, uio);
+   error = uiomove(buffer, n, uio);
if (error)
break;
}
@@ -546,8 +547,8 @@ ugen_do_read(struct ugen_softc *sc, int 
flags |= USBD_SHORT_XFER_OK;
if (sce->timeout == 0)
flags |= USBD_CATCH;
-   while ((n = min(UGEN_BBSIZE, uio->uio_resid)) != 0) {
-   DPRINTFN(1, ("ugenread: start transfer %d bytes\n",n));
+   while ((n = ulmin(UGEN_BBSIZE, uio->uio_resid)) != 0) {
+   DPRINTFN(1, ("ugenread: start transfer %zu bytes\n",n));
usbd_setup_xfer(xfer, sce->pipeh, 0, buf, n,
flags, sce->timeout, NULL);
err = usbd_transfer(xfer);
@@ -562,8 +563,8 @@ ugen_do_read(struct ugen_softc *sc, int 
break;
}
usbd_get_xfer_status(xfer, NULL, NULL, , NULL);
-   DPRINTFN(1, ("ugenread: got %d bytes\n", tn));
-   error = uiomovei(buf, tn, uio);
+   DPRINTFN(1, ("ugenread: got %u bytes\n", tn));
+   error = uiomove(buf, tn, uio);
if (error || tn < n)
break;
}
@@ -594,14 +595,14 @@ ugen_do_read(struct ugen_softc *sc, int 
 
while (sce->cur != sce->fill && uio->uio_resid > 0 && !error) {
if(sce->fill > sce->cur)
-   n = min(sce->fill - sce->cur, uio->uio_resid);
+   n = ulmin(sce->fill - sce->cur, uio->uio_resid);
else
-   n = min(sce->limit - sce->cur, uio->uio_resid);
+   n = ulmin(sce->limit - sce->cur, 
uio->uio_resid);
 
-   DPRINTFN(5, ("ugenread: isoc got %d chars\n", n));
+   DPRINTFN(5, ("ugenread: isoc got %zu chars\n", n));
 
/* Copy the data to the user process. */
-   error = uiomovei(sce->cur, n, uio);
+   error = uiomove(sce->cur, n, uio);
if (error)
break;
sce->cur += n;
@@ -638,7 +639,7 @@ int
 ugen_do_write(struct ugen_softc *sc, int endpt, struct uio *uio, int flag)
 {
struct ugen_endpoint *sce = >sc_endpoints[endpt][OUT];
-   u_int32_t n;
+   size_t n;
int flags, error = 0;
char buf[UGEN_BBSIZE];
struct usbd_xfer *xfer;
@@ -671,11 +672,11 @@ ugen_do_write(struct ugen_softc *sc, int
xfer = usbd_alloc_xfer(sc->sc_udev);
if (xfer == 0)
return (EIO);
-   while ((n = min(UGEN_BBSIZE, uio->uio_resid)) != 0) {
-   error = uiomovei(buf, n, uio);
+   while ((n = ulmin(UGEN_BBSIZE, uio->uio_resid)) != 0) {
+   error = uiomove(buf, n, uio);
if (error)
 

Re: read(2) shouldn't return EFBIG

2016-02-28 Thread Stefan Kempf
Stefan Kempf wrote:
> Martin Pieuchot wrote:
> > On 28/02/16(Sun) 13:14, Martin Natano wrote:
> > > The ext2fs_read() and ffs_read() functions return EFBIG when uio_offset
> > > is smaller than zero or larger than the maximum file size. However this
> > > doesn't seem to be in accordance with the POSIX read(2) specification,
> > > which requires EINVAL for an invalid offset (< 0) and a return of 0
> > > (zero bytes transferred) for an offset that's at or past the EOF. EFBIG
> > > actually means "File too large", which is clearly not true in both
> > > cases.
> > 
> > This seems to be coherent with MSDOSFS.
> > 
> > I'm also wondering when you say "an offset that's at or past the
> > EOF" does that include ``uio_resid''?  I mean shouldn't you check
> > for:
> > 
> > if ((uio->uio_offset + uio->uio_resid) > fs->fs_maxfilesize)
> > ...
> 
> I think if uio->uio_offset < fs->fs_maxfilesize, then
> min(uio->uio_resid, fs->fs_maxfilesize - uio->uio_offset)
> bytes should be read.

Ehh. Not quite true. It can't read more than the size of the file.

The read-loops of ffs_read and ext2_ind_read already check
whether uio_offset is beyond the size of the file:

for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = DIP(ip, size) - uio->uio_offset) <= 0)
break;

If we are assured that DIP(ip, size) cannot return a bogus size
(> fs->fs_maxfilesize), the check before the loop could be

if (uio->uio_offset >= DIP(ip, size) || uio->uio_resid == 0)
return (0);
 
> Maybe the check should be if (uio_offset >= fs->fs_maxfilesize)
> though. Otherwise access flags of the inode are changed at the end
> of ext2_ind_read() and ffs_read() even though there was no real access.
>  
> > > Index: ufs/ext2fs/ext2fs_readwrite.c
> > > ===
> > > RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_readwrite.c,v
> > > retrieving revision 1.39
> > > diff -u -p -r1.39 ext2fs_readwrite.c
> > > --- ufs/ext2fs/ext2fs_readwrite.c 27 Feb 2016 18:50:38 -  1.39
> > > +++ ufs/ext2fs/ext2fs_readwrite.c 28 Feb 2016 09:10:16 -
> > > @@ -100,9 +100,9 @@ ext2_ind_read(struct vnode *vp, struct i
> > >   } else if (vp->v_type != VREG && vp->v_type != VDIR)
> > >   panic("%s: type %d", "ext2fs_read", vp->v_type);
> > >  #endif
> > > - if (e2fs_overflow(fs, 0, uio->uio_offset))
> > > - return (EFBIG);
> > > - if (uio->uio_resid == 0)
> > > + if (uio->uio_offset < 0)
> > > + return (EINVAL);
> > > + if (uio->uio_offset > fs->e2fs_maxfilesize || uio->uio_resid == 0)
> > >   return (0);
> > >  
> > >   for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
> > > @@ -163,7 +163,6 @@ ext4_ext_read(struct vnode *vp, struct i
> > >   struct ext4_extent_path path;
> > >   struct ext4_extent nex, *ep;
> > >   struct buf *bp;
> > > - size_t orig_resid;
> > >   daddr_t lbn, pos;
> > >   off_t bytesinfile;
> > >   int size, xfersize, blkoffset;
> > > @@ -171,12 +170,10 @@ ext4_ext_read(struct vnode *vp, struct i
> > >  
> > >   memset(, 0, sizeof path);
> > >  
> > > - orig_resid = uio->uio_resid;
> > > - if (orig_resid == 0)
> > > + if (uio->uio_offset < 0)
> > > + return (EINVAL);
> > > + if (uio->uio_offset > fs->e2fs_maxfilesize || uio->uio_resid == 0)
> > >   return (0);
> > > -
> > > - if (e2fs_overflow(fs, 0, uio->uio_offset))
> > > - return (EFBIG);
> > >  
> > >   while (uio->uio_resid > 0) {
> > >   if ((bytesinfile = ext2fs_size(ip) - uio->uio_offset) <= 0)
> > > Index: ufs/ffs/ffs_vnops.c
> > > ===
> > > RCS file: /cvs/src/sys/ufs/ffs/ffs_vnops.c,v
> > > retrieving revision 1.84
> > > diff -u -p -r1.84 ffs_vnops.c
> > > --- ufs/ffs/ffs_vnops.c   27 Feb 2016 18:50:38 -  1.84
> > > +++ ufs/ffs/ffs_vnops.c   28 Feb 2016 09:10:16 -
> > > @@ -214,10 +214,9 @@ ffs_read(void *v)
> > >   panic("ffs_read: type %d", vp->v_type);
> > >  #endif
> > >   fs = ip->i_fs;
> > > - if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
> > > - return (EFBIG);
> > > -
> > > - if (uio->uio_resid == 0)
> > > + if (uio->uio_offset < 0)
> > > + return (EINVAL);
> > > + if (uio->uio_offset > fs->fs_maxfilesize || uio->uio_resid == 0)
> > >   return (0);
> > >  
> > >   for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
> > > 
> > 



Re: read(2) shouldn't return EFBIG

2016-02-28 Thread Stefan Kempf
Martin Pieuchot wrote:
> On 28/02/16(Sun) 13:14, Martin Natano wrote:
> > The ext2fs_read() and ffs_read() functions return EFBIG when uio_offset
> > is smaller than zero or larger than the maximum file size. However this
> > doesn't seem to be in accordance with the POSIX read(2) specification,
> > which requires EINVAL for an invalid offset (< 0) and a return of 0
> > (zero bytes transferred) for an offset that's at or past the EOF. EFBIG
> > actually means "File too large", which is clearly not true in both
> > cases.
> 
> This seems to be coherent with MSDOSFS.
> 
> I'm also wondering when you say "an offset that's at or past the
> EOF" does that include ``uio_resid''?  I mean shouldn't you check
> for:
> 
>   if ((uio->uio_offset + uio->uio_resid) > fs->fs_maxfilesize)
>   ...

I think if uio->uio_offset < fs->fs_maxfilesize, then
min(uio->uio_resid, fs->fs_maxfilesize - uio->uio_offset)
bytes should be read.

Maybe the check should be if (uio_offset >= fs->fs_maxfilesize)
though. Otherwise access flags of the inode are changed at the end
of ext2_ind_read() and ffs_read() even though there was no real access.
 
> > Index: ufs/ext2fs/ext2fs_readwrite.c
> > ===
> > RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_readwrite.c,v
> > retrieving revision 1.39
> > diff -u -p -r1.39 ext2fs_readwrite.c
> > --- ufs/ext2fs/ext2fs_readwrite.c   27 Feb 2016 18:50:38 -  1.39
> > +++ ufs/ext2fs/ext2fs_readwrite.c   28 Feb 2016 09:10:16 -
> > @@ -100,9 +100,9 @@ ext2_ind_read(struct vnode *vp, struct i
> > } else if (vp->v_type != VREG && vp->v_type != VDIR)
> > panic("%s: type %d", "ext2fs_read", vp->v_type);
> >  #endif
> > -   if (e2fs_overflow(fs, 0, uio->uio_offset))
> > -   return (EFBIG);
> > -   if (uio->uio_resid == 0)
> > +   if (uio->uio_offset < 0)
> > +   return (EINVAL);
> > +   if (uio->uio_offset > fs->e2fs_maxfilesize || uio->uio_resid == 0)
> > return (0);
> >  
> > for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
> > @@ -163,7 +163,6 @@ ext4_ext_read(struct vnode *vp, struct i
> > struct ext4_extent_path path;
> > struct ext4_extent nex, *ep;
> > struct buf *bp;
> > -   size_t orig_resid;
> > daddr_t lbn, pos;
> > off_t bytesinfile;
> > int size, xfersize, blkoffset;
> > @@ -171,12 +170,10 @@ ext4_ext_read(struct vnode *vp, struct i
> >  
> > memset(, 0, sizeof path);
> >  
> > -   orig_resid = uio->uio_resid;
> > -   if (orig_resid == 0)
> > +   if (uio->uio_offset < 0)
> > +   return (EINVAL);
> > +   if (uio->uio_offset > fs->e2fs_maxfilesize || uio->uio_resid == 0)
> > return (0);
> > -
> > -   if (e2fs_overflow(fs, 0, uio->uio_offset))
> > -   return (EFBIG);
> >  
> > while (uio->uio_resid > 0) {
> > if ((bytesinfile = ext2fs_size(ip) - uio->uio_offset) <= 0)
> > Index: ufs/ffs/ffs_vnops.c
> > ===
> > RCS file: /cvs/src/sys/ufs/ffs/ffs_vnops.c,v
> > retrieving revision 1.84
> > diff -u -p -r1.84 ffs_vnops.c
> > --- ufs/ffs/ffs_vnops.c 27 Feb 2016 18:50:38 -  1.84
> > +++ ufs/ffs/ffs_vnops.c 28 Feb 2016 09:10:16 -
> > @@ -214,10 +214,9 @@ ffs_read(void *v)
> > panic("ffs_read: type %d", vp->v_type);
> >  #endif
> > fs = ip->i_fs;
> > -   if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
> > -   return (EFBIG);
> > -
> > -   if (uio->uio_resid == 0)
> > +   if (uio->uio_offset < 0)
> > +   return (EINVAL);
> > +   if (uio->uio_offset > fs->fs_maxfilesize || uio->uio_resid == 0)
> > return (0);
> >  
> > for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
> > 
> 



Re: Fix overflow check in sys/netinet6/in6.c

2016-02-27 Thread Stefan Kempf
Martin Pieuchot wrote:
> On 17/02/16(Wed) 20:38, Stefan Kempf wrote:
> > Martin Pieuchot wrote:
> > It looks like NetBSD removed the SIOCSIFALIFETIME_IN6 ioctl a long time
> > ago, along with the overflow checks, saying that this ioctl could never
> > have worked:
> > http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/netinet6/in6.c?rev=1.132&content-type=text/x-cvsweb-markup&only_with_tag=MAIN
> 
> Indeed we should remove it as well.

Here's a diff to remove this ioctl, now that the tree is unlocked.
Nothing in base uses it.

ok?

Index: netinet6/in6.c
===
RCS file: /cvs/src/sys/netinet6/in6.c,v
retrieving revision 1.183
diff -u -p -r1.183 in6.c
--- netinet6/in6.c  21 Jan 2016 11:23:48 -  1.183
+++ netinet6/in6.c  27 Feb 2016 17:24:02 -
@@ -253,7 +253,6 @@ in6_control(struct socket *so, u_long cm
case SIOCSPFXFLUSH_IN6:
case SIOCSRTRFLUSH_IN6:
case SIOCGIFALIFETIME_IN6:
-   case SIOCSIFALIFETIME_IN6:
case SIOCGIFSTAT_IN6:
case SIOCGIFSTAT_ICMP6:
sa6 = >ifr_addr;
@@ -337,26 +336,6 @@ in6_control(struct socket *so, u_long cm
if (ia6 == NULL)
return (EADDRNOTAVAIL);
break;
-   case SIOCSIFALIFETIME_IN6:
-   {
-   struct in6_addrlifetime *lt;
-
-   if (!privileged)
-   return (EPERM);
-   if (ia6 == NULL)
-   return (EADDRNOTAVAIL);
-   /* sanity for overflow - beware unsigned */
-   lt = >ifr_ifru.ifru_lifetime;
-   if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME
-&& lt->ia6t_vltime + time_second < time_second) {
-   return EINVAL;
-   }
-   if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME
-&& lt->ia6t_pltime + time_second < time_second) {
-   return EINVAL;
-   }
-   break;
-   }
}
 
switch (cmd) {
@@ -425,21 +404,6 @@ in6_control(struct socket *so, u_long cm
} else
retlt->ia6t_preferred = maxexpire;
}
-   break;
-
-   case SIOCSIFALIFETIME_IN6:
-   ia6->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime;
-   /* for sanity */
-   if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
-   ia6->ia6_lifetime.ia6t_expire =
-   time_second + ia6->ia6_lifetime.ia6t_vltime;
-   } else
-   ia6->ia6_lifetime.ia6t_expire = 0;
-   if (ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
-   ia6->ia6_lifetime.ia6t_preferred =
-   time_second + ia6->ia6_lifetime.ia6t_pltime;
-   } else
-   ia6->ia6_lifetime.ia6t_preferred = 0;
break;
 
case SIOCAIFADDR_IN6:
Index: netinet6/in6_var.h
===
RCS file: /cvs/src/sys/netinet6/in6_var.h,v
retrieving revision 1.59
diff -u -p -r1.59 in6_var.h
--- netinet6/in6_var.h  21 Jan 2016 11:23:48 -  1.59
+++ netinet6/in6_var.h  27 Feb 2016 17:24:02 -
@@ -405,7 +405,6 @@ struct  in6_rrenumreq {
 #define SIOCSRTRFLUSH_IN6  _IOWR('i', 80, struct in6_ifreq)
 
 #define SIOCGIFALIFETIME_IN6   _IOWR('i', 81, struct in6_ifreq)
-#define SIOCSIFALIFETIME_IN6   _IOWR('i', 82, struct in6_ifreq)
 #define SIOCGIFSTAT_IN6_IOWR('i', 83, struct in6_ifreq)
 #define SIOCGIFSTAT_ICMP6  _IOWR('i', 84, struct in6_ifreq)
 



Re: Harmful casts in ufs

2016-02-21 Thread Stefan Kempf
Todd C. Miller wrote:
> On Sun, 21 Feb 2016 11:49:55 +0100, Martin Natano wrote:
> 
> > The diff below addresses the issues you mentioned. It converts
> > mnt_maxsymlinklen to unsigned and adds a check to ffs_validate() that
> > makes sure, that fs_maxsymlinklen is >= 0. That function is called
> > during mount and on fsck. This should make sure we won't get a bogus
> > fs_maxsymlinklen from the superblock.
> 
> I think it is better to just set fsp->fs_maxsymlinklen to 0 if it
> is negative in the superblock.  We shouldn't fail to mount the
> filesystem in this case.

newfs back to 4.4BSD sets fs_maxsymlinklen to 0 for the "old" format.
A negative value should never happen unless this superblock field is
corrupted.

Should we really mount the FS in that case? If the FS was of the
"new" format, then short symlinks would store the destination path in the
inode directly. I think we'd not be able to correctly follow these symlinks
if we set fs_maxsymlinklen to 0 when encountering a negative value.
 
>  - todd



Re: diff -e: mishandling of bare dots

2016-02-21 Thread Stefan Kempf
Martin Natano wrote:
> When diff encounters a line that consists of a single dot, it emits two
> dots instead, stops the current command and emits a substitute command
> to replace the double dot with a single one. Then it restarts the
> (original) command if necessary and inserts further lines. This is done
> because a single dot on a line does have special meaning in ed. (It
> stops text insertion.)
> 
> However, there are multiple issues with the current implementation,
> resulting in mangled output:
> 
> - The line number for the substitute command should be the number of the
> most recently inserted line. diff instead uses the number of the first
> inserted line of the current hunk. The first character of that line is
> removed when applying the diff, while the superfluous dot is not.
> 
> - The line number of the restarted command is not adjusted for the
> number of lines already inserted, resulting in the reordering of lines.
> 
> - When there is a bare dot in the replacement text of a change command,
> too many lines are deleted, because a second change command is emitted.
> An append command should be emitted instead, because the target lines
> have already been removed by the first change command.
> 
> The following patch fixes all those issues.

Hmm, something still does not seem right. For example, try
diff -e on the contents below (some other tests attached):

A
B
C

vs.

W
X
.
Y
Z

> Index: diffreg.c
> ===
> RCS file: /cvs/src/usr.bin/diff/diffreg.c,v
> retrieving revision 1.90
> diff -u -p -r1.90 diffreg.c
> --- diffreg.c 26 Oct 2015 12:52:27 -  1.90
> +++ diffreg.c 13 Feb 2016 16:35:08 -
> @@ -1075,8 +1075,12 @@ proceed:
>* back and restart where we left off.
>*/
>   diff_output(".\n");
> - diff_output("%ds/.//\n", a);
> + diff_output("%ds/.//\n", a + i - 1);

This I understand. We really need to substitute the line that contains
the ..

About the changes below, is something like this the idea behind it?
The first change command has already removed lines [a,b] and replaced them
with lines [c, c + i]. So for the rest of the lines in [c + i + 1, d], we
need to make sure that they are appended after the line that follows the
inserted '.'

Can't we set b = a + i - 1 and a = b + 1 to make sure that the rest
of the 'to' file is appended after the proper line?

>   a += i;
> + if (a > b)
> + b += i;
> + else
> + b = a - 1;
>   c += i;
>   goto restart;
>   }

W
X
.
Y
Z
A
W
X
.
Y
Z
A
B
C
W
X
.
Y
Z
A
B
C
W
X
.
Y
Z
.


Re: fusefs_fhtovp() falsely rejects root file handle

2016-02-21 Thread Stefan Kempf
Martin Natano wrote:
> The fusefs_fhtovp() function makes use of the ROOTINO ((ufsino_t)2)
> define instead of using FUSE_ROOTINO ((ino_t)1), which is used
> everywhere else in the fuse filesystem. This causes a file handle for
> the filesystem root to be falsely rejected with ESTALE.
> 
> Comments?

As discussed, yes this looks good.
 
> Index: miscfs/fuse/fuse_vfsops.c
> ===
> RCS file: /cvs/src/sys/miscfs/fuse/fuse_vfsops.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 fuse_vfsops.c
> --- miscfs/fuse/fuse_vfsops.c 19 Jul 2015 14:21:14 -  1.16
> +++ miscfs/fuse/fuse_vfsops.c 16 Feb 2016 09:12:29 -
> @@ -304,7 +304,7 @@ fusefs_fhtovp(struct mount *mp, struct f
>  
>   ufhp = (struct ufid *)fhp;
>   if (ufhp->ufid_len != sizeof(struct ufid) ||
> - ufhp->ufid_ino < ROOTINO)
> + ufhp->ufid_ino < FUSE_ROOTINO)
>   return (ESTALE);
>  
>   return (VFS_VGET(mp, ufhp->ufid_ino, vpp));
> 



Re: fusefs_checkexp() return value

2016-02-21 Thread Stefan Kempf
Martin Natano wrote:
> The fusefs_checkexp() function returns 0, indicating that export via NFS
> is allowed, while in fact fusefs doesn't support NFS at all. (There is
> a vfs_export() call missing in fusefs_mount() and maybe other stuff too.)
> 
> Furthermore, it does so without setting *extflagsp and *credanonp, which
> results in an uvm fault when trying to NFS mount a directory of a fusefs
> file system.
> 
> tmpfs and udf also don't support NFS, returning EOPNOTSUPP and EACCES
> respectively. In my opinion EOPNOTSUPP is preferable, because it can be
> parsed as "this filesystem doesn't support NFS".
> 
> Any comments?

Looks good. EOPNOTSUPP sounds more reasonable to me as well.
 
> Index: miscfs/fuse/fuse_vfsops.c
> ===
> RCS file: /cvs/src/sys/miscfs/fuse/fuse_vfsops.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 fuse_vfsops.c
> --- miscfs/fuse/fuse_vfsops.c 19 Jul 2015 14:21:14 -  1.16
> +++ miscfs/fuse/fuse_vfsops.c 16 Feb 2016 09:02:40 -
> @@ -363,5 +363,5 @@ int
>  fusefs_checkexp(struct mount *mp, struct mbuf *nam, int *extflagsp,
>  struct ucred **credanonp)
>  {
> - return (0);
> + return (EOPNOTSUPP);
>  }
> 
> natano
> 



Re: undefined behaviour in add_entropy_words()

2016-02-18 Thread Stefan Kempf
Martin Natano wrote:
> Hi,
> 
> The add_entropy_words() function performs a right shift by
> (32 - entropy_input_rotate) bits, with entropy_input_rotate being an
> integer between [0..31]. This can lead to a shift of 32 on a 32 bit
> value, which is undefined behaviour in C. The standard says this: "If
> the value of the right operand is negative or is greater than or equal
> to the width of the promoted left operand, the behaviour is undefined."
> In our case the value is 32 and the width of the promoted left operand
> is 32 too, thus the behaviour is undefined.
> 
> As a matter of fact the expression doesn't yield the (probably) expected
> result of 0 on i386 and amd64 machines. The reason being, that the shift
> exponent is truncated to 5 bits on x86.
> 
> natano@obsd:~$ uname -a
> OpenBSD obsd.my.domain 5.9 GENERIC.MP#1862 amd64
> natano@obsd:~$ cat test.c
> #include <stdio.h>
> 
> int
> main(void)
> {
> unsigned int x = 0x;
> printf("x >> 32: 0x%x\n", x >> 32);
> return 0;
> }
> natano@obsd:~$ cc test.c
> test.c: In function 'main':
> test.c:7: warning: right shift count >= width of type
> natano@obsd:~$ ./a.out
> x >> 32: 0x
> natano@obsd:~$
> 
> On the other hand ppc truncates the shift exponent to 6 bits. (I didn't
> try it, but
> http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
> says so).
> 
> In the specific instance below, the differing behaviour between x86 and
> pcc doesn't matter, because of the equality (x | x) == (x | 0) == x.
> 
> However, I think it might be still worth fixing this, because we can't
> rely on future versions of gcc and other compilers retaining this
> behaviour. The behaviour is undefined, so the compiler can do whatever
> it wants (insert obligatory reference to nasal demons here).
 
I think we don't mix declarations and code.
Would this be an option?

diff --git a/dev/rnd.c b/dev/rnd.c
index 819ce0d..0f57b1b 100644
--- a/dev/rnd.c
+++ b/dev/rnd.c
@@ -421,7 +421,7 @@ add_entropy_words(const u_int32_t *buf, u_int n)
 
for (; n--; buf++) {
u_int32_t w = (*buf << entropy_input_rotate) |
-   (*buf >> (32 - entropy_input_rotate));
+   (*buf >> ((32 - entropy_input_rotate) & 31));
u_int i = entropy_add_ptr =
(entropy_add_ptr - 1) & POOLMASK;
/*
 
> Index: dev/rnd.c
> ===
> RCS file: /cvs/src/sys/dev/rnd.c,v
> retrieving revision 1.178
> diff -u -p -u -r1.178 rnd.c
> --- dev/rnd.c 8 Jan 2016 07:54:02 -   1.178
> +++ dev/rnd.c 31 Jan 2016 10:11:17 -
> @@ -420,8 +420,9 @@ add_entropy_words(const u_int32_t *buf, 
>   };
>  
>   for (; n--; buf++) {
> - u_int32_t w = (*buf << entropy_input_rotate) |
> - (*buf >> (32 - entropy_input_rotate));
> + u_int32_t w = *buf << entropy_input_rotate;
> + if (entropy_input_rotate > 0)
> + w |= *buf >> (32 - entropy_input_rotate);
>   u_int i = entropy_add_ptr =
>   (entropy_add_ptr - 1) & POOLMASK;
>   /*
> 
> cheers,
> natano
> 



Re: Harmful casts in ufs

2016-02-18 Thread Stefan Kempf
Todd C. Miller wrote:
> On Wed, 17 Feb 2016 10:22:04 +0100, Martin Natano wrote:
> 
> > Casting the result of ext2fs_size() and DIP(ip, size) to int potentially
> > truncates the result. Issue found by Stefan Kempf, see
> > https://marc.info/?l=openbsd-tech&m=145495905416536 .
> > 
> > While there I also removed the cast in the ext2fs_chmod() call, because
> > the function expects a mode_t argument anyway.
> 
> There is currently code that checks for mnt_maxsymlinklen <= 0.
> Removing the cast will cause other problems for ffs if the maxsymlinklen
> value is negative.  I don't think it is safe to make this change
> unless mnt_maxsymlinklen is made unsigned in struct mount and a
> check is added to the assignment of mnt_maxsymlinklen from
> fs_maxsymlinklen in ufs/ffs/ffs_vfsops.c to avoid assigning a
> negative value.

That makes sense. Those <= 0 checks look whether the FFS is in the
"old" format. When creating an old format FFS, newfs creates a superblock
with fs_maxsymlinklen of 0. A negative fs_maxsymlinklen should never
happen except for bogus superblocks. So checking for this when mounting
the filesystem looks reasonable.

>  - todd
> 



Re: Fix overflow check in sys/netinet6/in6.c

2016-02-17 Thread Stefan Kempf
Martin Pieuchot wrote:
> On 13/02/16(Sat) 18:51, Stefan Kempf wrote:
> > Some thoughts about this:
> > 
> > If this particular type of undefined behavior is really a concern: maybe
> > looking for bounds/overflow checks that are incorrect besides undefined
> > behavior first is a better approach. A good way of fixing those will
> > be found, which could then be applied to the "just undefined behavior"
> > cases.
> > 
> > Details below.
> > 
> > Michael McConville wrote:
> > > time_second is a time_t, which we define as int64_t. The other operands
> > > used are of type uint32_t. Therefore, these checks get promoted to
> > > int64_t and the overflow being tested is undefined because it uses
> > > signed arithmetic.
> > > 
> > > I think that the below diff fixes the overflow check. However, I'm
> > > pretty tired at the moment and this is hard to do right, so review is
> > > appreciated.
> >  
> > If you know that lt->ia6t_[pv]time will stay an uint32_t forever,
> > then isn't checking for (time_second > INT64_MAX - lt->ia6t_vltime)
> > enough? The right side of the comparison will always be > 0 then.
> > If we somehow had time_second < 0, then your sanity check would still
> > work without checking for time_second > 0.
> > Don't think we ever have time_second < 0 though, since it's the
> > seconds since the Epoch.
> 
> Does your reasoning also apply to time_uptime?  Because it might make
> sense to first convert this code to use time_uptime first to avoid leaps
> that might be triggered by clock_settime(2) or settimeofday(2) for example.

I think it holds. time_uptime and time_second are both updated in
tc_windup(), and as far as I can tell after a quick read of
the code, time_uptime is only added to, as is reasonable to expect :-)

> Such work as been done in NetBSD as far as I know.

The netinet/in6.c of NetBSD uses time_uptime. Some other things I noted
while looking at NetBSD below. Can't judge how these findings
apply to our tree though.

But mentioning lifetimes and expire times, your time_uptime suggestion
sounds reasonable to me in this context.

It looks like NetBSD removed the SIOCSIFALIFETIME_IN6 ioctl a long time
ago, along with the overflow checks, saying that this ioctl could never
have worked:
http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/netinet6/in6.c?rev=1.132&content-type=text/x-cvsweb-markup&only_with_tag=MAIN

These assignments are also there in in6_update_ifa(), but without
overflow checks, it seems. This routine is called when doing a
SIOCAIFADDR_IN6 ioctl(). NetBSD has them also.

if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia6->ia6_lifetime.ia6t_expire =
time_second + ia6->ia6_lifetime.ia6t_vltime;
} else
ia6->ia6_lifetime.ia6t_expire = 0;
if (ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia6->ia6_lifetime.ia6t_preferred =
time_second + ia6->ia6_lifetime.ia6t_pltime;



Re: Fix overflow check in sys/netinet6/in6.c

2016-02-13 Thread Stefan Kempf
Some thoughts about this:

If this particular type of undefined behavior is really a concern: maybe
looking for bounds/overflow checks that are incorrect besides undefined
behavior first is a better approach. A good way of fixing those will
be found, which could then be applied to the "just undefined behavior"
cases.

Details below.

Michael McConville wrote:
> time_second is a time_t, which we define as int64_t. The other operands
> used are of type uint32_t. Therefore, these checks get promoted to
> int64_t and the overflow being tested is undefined because it uses
> signed arithmetic.
> 
> I think that the below diff fixes the overflow check. However, I'm
> pretty tired at the moment and this is hard to do right, so review is
> appreciated.
 
If you know that lt->ia6t_[pv]time will stay an uint32_t forever,
then isn't checking for (time_second > INT64_MAX - lt->ia6t_vltime)
enough? The right side of the comparison will always be > 0 then.
If we somehow had time_second < 0, then your sanity check would still
work without checking for time_second > 0.
Don't think we ever have time_second < 0 though, since it's the
seconds since the Epoch.

In general, I'm not sure if rewriting these checks using *_MAX constants
is the best way to do it, because we'd have to review all of them once the
types of variables involved in the check change.

Clang and newer gcc versions have __builtin_add_overflow, but it's
compiler specific.

One final note, if we really care about that overflow check being
performed in in6_control:
(So this is about the overflow check here itself, not your diff).

The assignments:
ia6->ia6_lifetime.ia6t_expire =
time_second + ia6->ia6_lifetime.ia6t_vltime;
ia6->ia6_lifetime.ia6t_preferred =
time_second + ia6->ia6_lifetime.ia6t_pltime;

are done after the overflow checks. time_second is updated by
hardclock(), which is triggered by the timer interrupt, AFAIK.
If this code runs with interrupts enabled, time_second can yield
a different value for every read access. So the computation might
still overflow in this case.

Practically this should only overflow now when somebody sets the clock
far into the future.

I suppose this check comes from times where time_t was 32 bit.
 
> Index: netinet6/in6.c
> ===
> RCS file: /cvs/src/sys/netinet6/in6.c,v
> retrieving revision 1.183
> diff -u -p -r1.183 in6.c
> --- netinet6/in6.c21 Jan 2016 11:23:48 -  1.183
> +++ netinet6/in6.c12 Feb 2016 19:44:10 -
> @@ -70,6 +70,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -348,11 +349,13 @@ in6_control(struct socket *so, u_long cm
>   /* sanity for overflow - beware unsigned */
>   lt = >ifr_ifru.ifru_lifetime;
>   if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME
> -  && lt->ia6t_vltime + time_second < time_second) {
> + && (time_second > 0
> + && time_second > INT64_MAX - lt->ia6t_vltime)) {
>   return EINVAL;
>   }
>   if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME
> -  && lt->ia6t_pltime + time_second < time_second) {
> + && (time_second > 0
> + && time_second > INT64_MAX - lt->ia6t_pltime)) {
>   return EINVAL;
>   }
>   break;
> 



Re: ufs/ffs/ext2fs uiomove() conversion

2016-02-08 Thread Stefan Kempf
Martin Natano wrote:
> Below the conversion to uiomove() for ufs. While there I changed all
> instances of 'blkoffset', 'size' and 'xfersize' that were declared as
> long integers to be plain integers instead for consistency with the
> surrounding code. These variables are limited by fs_bsize and e2fs_bsize
> respectively, which are int32_t. So, no overflow should be introduced by
> that change. Also, 'size' is passed to bread(), which expects an integer
> parameter.

These all look good. It's easy to see that these variables have all
small max. values.
 
One unrelated thing noted while reviewing:

ufs/ext2fs/ext2fs_readwrite.c:
static int
ext2_ind_read(struct vnode *vp, struct inode *ip, struct m_ext2fs *fs,
struct uio *uio)
{
...

if (vp->v_type == VLNK) {
if ((int)ext2fs_size(ip) < vp->v_mount->mnt_maxsymlinklen ||
^

(vp->v_mount->mnt_maxsymlinklen == 0 &&
ip->i_e2fs_nblock == 0))
panic("%s: short symlink", "ext2fs_read");

ufs/ffs/ffs_vnops.c:
int
ffs_read(void *v)
{
...
if (vp->v_type == VLNK) {
if ((int)DIP(ip, size) < vp->v_mount->mnt_maxsymlinklen ||

(vp->v_mount->mnt_maxsymlinklen == 0 &&
 DIP(ip, blocks) == 0))
panic("ffs_read: short symlink");

This code checks that the filesystem read routines are not invoked on
symlinks where the destination path fits into the inode itself.
We should get rid of these truncating casts in a separate diff.

> Index: ufs/ext2fs/ext2fs_lookup.c
> ===
> RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_lookup.c,v
> retrieving revision 1.39
> diff -u -p -u -r1.39 ext2fs_lookup.c
> --- ufs/ext2fs/ext2fs_lookup.c14 Mar 2015 03:38:52 -  1.39
> +++ ufs/ext2fs/ext2fs_lookup.c7 Feb 2016 17:48:03 -
> @@ -177,7 +177,7 @@ ext2fs_readdir(void *v)
>   break;
>   }
>   dstd.d_off = off + e2d_reclen;
> - if ((error = uiomovei((caddr_t), dstd.d_reclen, 
> uio)) != 0) {
> + if ((error = uiomove((caddr_t), dstd.d_reclen, 
> uio)) != 0) {
>   break;
>   }
>   off = off + e2d_reclen;
> Index: ufs/ext2fs/ext2fs_readwrite.c
> ===
> RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_readwrite.c,v
> retrieving revision 1.36
> diff -u -p -u -r1.36 ext2fs_readwrite.c
> --- ufs/ext2fs/ext2fs_readwrite.c 12 Jan 2016 11:44:21 -  1.36
> +++ ufs/ext2fs/ext2fs_readwrite.c 7 Feb 2016 17:48:03 -
> @@ -87,7 +87,7 @@ ext2_ind_read(struct vnode *vp, struct i
>   struct buf *bp;
>   daddr_t lbn, nextlbn;
>   off_t bytesinfile;
> - long size, xfersize, blkoffset;
> + int size, xfersize, blkoffset;
>   int error;
>  
>  #ifdef DIAGNOSTIC
> @@ -145,7 +145,7 @@ ext2_ind_read(struct vnode *vp, struct i
>   break;
>   xfersize = size;
>   }
> - error = uiomovei((char *)bp->b_data + blkoffset, xfersize, uio);
> + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
>   if (error)
>   break;
>   brelse(bp);
> @@ -168,7 +168,7 @@ ext4_ext_read(struct vnode *vp, struct i
>   size_t orig_resid;
>   daddr_t lbn, pos;
>   off_t bytesinfile;
> - long size, xfersize, blkoffset;
> + int size, xfersize, blkoffset;
>   int error, cache_type;
>  
>   memset(, 0, sizeof path);
> @@ -225,7 +225,7 @@ ext4_ext_read(struct vnode *vp, struct i
>   }
>   xfersize = size;
>   }
> - error = uiomovei(bp->b_data + blkoffset, xfersize, uio);
> + error = uiomove(bp->b_data + blkoffset, xfersize, uio);
>   brelse(bp);
>   if (error)
>   return (error);
> @@ -248,7 +248,8 @@ ext2fs_write(void *v)
>   int32_t lbn;
>   off_t osize;
>   int blkoffset, error, flags, ioflag, size, xfersize;
> - ssize_t resid, overrun;
> + size_t resid;
> + ssize_t overrun;
>  
>   ioflag = ap->a_ioflag;
>   uio = ap->a_uio;
> @@ -324,8 +325,7 @@ ext2fs_write(void *v)
>   if (size < xfersize)
>   xfersize = size;
>  
> - error =
> - uiomovei((char *)bp->b_data + blkoffset, (int)xfersize, 
> uio);
> + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
>  #if 0
>   if (ioflag & IO_NOCACHE)
>   bp->b_flags |= B_NOCACHE;
> Index: ufs/ext2fs/ext2fs_vnops.c
> ===
> RCS file: 

Re: bktr uiomove() conversion

2016-02-07 Thread Stefan Kempf
Martin Natano wrote:
> Below the conversion from uiomovei() to uiomove() for bktr. The patch
> also replaces two occurrences of uio->uio_iov->iov_len with
> uio->uio_resid. I don't see a reason why bktr should inspect iov_len
> directly.

Looks good. As for computing count, we have 0 <= bktr->rows <= 0x7fe,
0 <= bktr->cols <= 0x3fe and pixfmt_table[bktr->pixfmt].public.Bpp is 4
at most. So count is always positive even with an int and changing
it to size_t and calling uiomove keeps the same behavior as today.

Checking uio_resid and using ulmin also makes sense. bktr->vbisize is
bounded by VBI_BUFFER_SIZE and uio_resid is expected to be the sum
of the iov_len when video_read or vbi_read is called.
 
> Index: dev/pci/bktr/bktr_core.c
> ===
> RCS file: /cvs/src/sys/dev/pci/bktr/bktr_core.c,v
> retrieving revision 1.36
> diff -u -p -u -r1.36 bktr_core.c
> --- dev/pci/bktr/bktr_core.c  14 Mar 2015 03:38:49 -  1.36
> +++ dev/pci/bktr/bktr_core.c  24 Jan 2016 11:25:49 -
> @@ -993,7 +993,7 @@ int
>  video_read(bktr_ptr_t bktr, int unit, dev_t dev, struct uio *uio)
>  {
>  int status;
> -int count;
> +size_t  count;
>  
>  
>   if (bktr->bigbuf == 0)  /* no frame buffer allocated (ioctl failed) */
> @@ -1008,7 +1008,7 @@ video_read(bktr_ptr_t bktr, int unit, de
>   count = bktr->rows * bktr->cols *
>   pixfmt_table[ bktr->pixfmt ].public.Bpp;
>  
> - if ((int) uio->uio_iov->iov_len < count)
> + if (uio->uio_resid < count)
>   return( EINVAL );
>  
>   bktr->flags &= ~(METEOR_CAP_MASK | METEOR_WANT_MASK);
> @@ -1027,7 +1027,7 @@ video_read(bktr_ptr_t bktr, int unit, de
>  
>   status = tsleep(BKTR_SLEEP, BKTRPRI, "captur", 0);
>   if (!status)/* successful capture */
> - status = uiomovei((caddr_t)bktr->bigbuf, count, uio);
> + status = uiomove((caddr_t)bktr->bigbuf, count, uio);
>   else
>   printf ("%s: read: tsleep error %d\n",
>   bktr_name(bktr), status);
> @@ -1047,7 +1047,7 @@ video_read(bktr_ptr_t bktr, int unit, de
>  int
>  vbi_read(bktr_ptr_t bktr, struct uio *uio, int ioflag)
>  {
> - int readsize, readsize2;
> + size_t  readsize, readsize2;
>   int status;
>  
>  
> @@ -1067,22 +1067,20 @@ vbi_read(bktr_ptr_t bktr, struct uio *ui
>   /* We cannot read more bytes than there are in
>* the circular buffer
>*/
> - readsize = (int)uio->uio_iov->iov_len;
> -
> - if (readsize > bktr->vbisize) readsize = bktr->vbisize;
> + readsize = ulmin(uio->uio_resid, bktr->vbisize);
>  
>   /* Check if we can read this number of bytes without having
>* to wrap around the circular buffer */
> - if((bktr->vbistart + readsize) >= VBI_BUFFER_SIZE) {
> + if (readsize >= VBI_BUFFER_SIZE - bktr->vbistart) {
>   /* We need to wrap around */
>  
>   readsize2 = VBI_BUFFER_SIZE - bktr->vbistart;
> - status = uiomovei((caddr_t)bktr->vbibuffer + bktr->vbistart, 
> readsize2, uio);
> + status = uiomove((caddr_t)bktr->vbibuffer + bktr->vbistart, 
> readsize2, uio);
>   if (status == 0)
> - status = uiomovei((caddr_t)bktr->vbibuffer, (readsize - 
> readsize2), uio);
> + status = uiomove((caddr_t)bktr->vbibuffer, (readsize - 
> readsize2), uio);
>   } else {
>   /* We do not need to wrap around */
> - status = uiomovei((caddr_t)bktr->vbibuffer + bktr->vbistart, 
> readsize, uio);
> + status = uiomove((caddr_t)bktr->vbibuffer + bktr->vbistart, 
> readsize, uio);
>   }
>  
>   /* Update the number of bytes left to read */
> 
> cheers,
> natano
> 



Re: bpf uiomove() conversion

2016-02-06 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for bpf. bd_hlen is a signed integer, but
> can't be negative, because it contains the size of a buffer. Thus, the
> conversion to size_t is ok.

Looks good. bd_hlen is assigned from bd_slen which is in turn computed
in bpf_catchpacket(). This function makes sure that bd_slen does not
exceed the size of a buffer.
 
> Index: net/bpf.c
> ===
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.132
> diff -u -p -u -r1.132 bpf.c
> --- net/bpf.c 7 Jan 2016 05:31:17 -   1.132
> +++ net/bpf.c 9 Jan 2016 11:03:33 -
> @@ -212,7 +212,7 @@ bpf_movein(struct uio *uio, u_int linkty
>   m->m_len = len;
>   *mp = m;
>  
> - error = uiomovei(mtod(m, caddr_t), len, uio);
> + error = uiomove(mtod(m, caddr_t), len, uio);
>   if (error)
>   goto bad;
>  
> @@ -488,7 +488,7 @@ bpfread(dev_t dev, struct uio *uio, int 
>* We know the entire buffer is transferred since
>* we checked above that the read buffer is bpf_bufsize bytes.
>*/
> - error = uiomovei(d->bd_hbuf, d->bd_hlen, uio);
> + error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
>  
>   s = splnet();
>   d->bd_fbuf = d->bd_hbuf;
> 
> cheers,
> natano
> 



Re: nfs uiomove() conversion

2016-02-06 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for nfs. I didn't change the type of 'n'
> to be size_t, because it never exceeds the maximum rpc size (nm_rsize),
> which is an integer too. (Also, to avoid unnecessary code churn.)

Looks good. It's easy to see in the code that (0 < on < biosize), so we
have (biosize - on > 0). Since these variables are ints, the result of
ulmin will fit into an int also.

Remaining comments see inline.
 
> Index: nfs/nfs_bio.c
> ===
> RCS file: /cvs/src/sys/nfs/nfs_bio.c,v
> retrieving revision 1.80
> diff -u -p -u -r1.80 nfs_bio.c
> --- nfs/nfs_bio.c 14 Mar 2015 03:38:52 -  1.80
> +++ nfs/nfs_bio.c 27 Jan 2016 20:46:53 -
> @@ -177,7 +177,7 @@ again:
>   return (error);
>   }
>   }
> - n = min((unsigned)(biosize - on), uio->uio_resid);
> + n = ulmin(biosize - on, uio->uio_resid);
>   offdiff = np->n_size - uio->uio_offset;
>   if (offdiff < (off_t)n)
>   n = (int)offdiff;
> @@ -211,7 +211,7 @@ again:
>   return (error);
>   }
>   }
> - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
> + n = ulmin(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
>   got_buf = 1;
>   on = 0;
>   break;
> @@ -223,7 +223,7 @@ again:
>   if (n > 0) {
>   if (!baddr)
>   baddr = bp->b_data;
> - error = uiomovei(baddr + on, (int)n, uio);
> + error = uiomove(baddr + on, n, uio);
>   }
>  
>   if (vp->v_type == VLNK)
> @@ -318,7 +318,7 @@ nfs_write(void *v)
>   nfsstats.biocache_writes++;
>   lbn = uio->uio_offset / biosize;
>   on = uio->uio_offset & (biosize-1);
> - n = min((unsigned)(biosize - on), uio->uio_resid);
> + n = ulmin(biosize - on, uio->uio_resid);
>   bn = lbn * (biosize / DEV_BSIZE);
>  again:
>   bp = nfs_getcacheblk(vp, bn, biosize, p);
> @@ -349,7 +349,7 @@ again:
>   goto again;
>   }
>  
> - error = uiomovei((char *)bp->b_data + on, n, uio);
> + error = uiomove((char *)bp->b_data + on, n, uio);
>   if (error) {
>   bp->b_flags |= B_ERROR;
>   brelse(bp);
> @@ -590,7 +590,7 @@ nfs_doio(struct buf *bp, struct proc *p)
> >   len = np->n_size - ((((off_t)bp->b_blkno) << DEV_BSHIFT)
>   + diff);
>   if (len > 0) {
> - len = min(len, uiop->uio_resid);
> + len = ulmin(len, uiop->uio_resid);
>   memset((char *)bp->b_data + diff, 0, len);
>   bp->b_validend = diff + len;
>   } else
> Index: nfs/nfs_subs.c
> ===
> RCS file: /cvs/src/sys/nfs/nfs_subs.c,v
> retrieving revision 1.128
> diff -u -p -u -r1.128 nfs_subs.c
> --- nfs/nfs_subs.c16 Jun 2015 11:09:40 -  1.128
> +++ nfs/nfs_subs.c27 Jan 2016 20:46:58 -
> @@ -712,8 +712,8 @@ nfsm_uiotombuf(struct mbuf **mp, struct 
>   uiop->uio_rw = UIO_WRITE;
>  
>   while (len) {
> - xfer = min(len, M_TRAILINGSPACE(mb));
> - uiomovei(mb_offset(mb), xfer, uiop);
> + xfer = ulmin(len, M_TRAILINGSPACE(mb));
> + uiomove(mb_offset(mb), xfer, uiop);
>   mb->m_len += xfer;
>   len -= xfer;
if (len > 0) {

Ok also. len is a size_t and we already know that M_TRAILINGSPACE
returns a positive value.

> Index: nfs/nfs_vnops.c
> ===
> RCS file: /cvs/src/sys/nfs/nfs_vnops.c,v
> retrieving revision 1.166
> diff -u -p -u -r1.166 nfs_vnops.c
> --- nfs/nfs_vnops.c   22 Dec 2015 21:36:57 -  1.166
> +++ nfs/nfs_vnops.c   27 Jan 2016 20:47:05 -
> @@ -2032,7 +2032,7 @@ nfs_readdir(void *v)
>   break;
>   }
>  
> - if ((error = uiomovei(dp, dp->d_reclen, uio)))
> + if ((error = uiomove(dp, dp->d_reclen, uio)))

Simple because d_reclen is unsigned.



Re: Signed overflow in ufs i_modrev calculation

2016-02-03 Thread Stefan Kempf
Mike Belopuhov wrote:
> Any OKs or objections to this diff?  This looks solid to me, FWIW.

Looks good to me as well.

ok stefan@
 
> On Wed, Jan 27, 2016 at 09:52 +0100, Martin Natano wrote:
> > In ufs, the calculation of i_modrev can produce signed overflow on 32
> > bit architectures (found on i386). The tv.tv_usec * 4294 calculation is
> > designed to move the microseconds part of a struct timeval to the upper
> > bits of an unsigned(!) 32 bit value to make room for simple i_modrev
> > increments, but the calculation is performed signed, causing overflow.
> > The diff below gets rid of the overflow by casting to unsigned first.
> > 
> > While there I replaced the union _qcvt/SETHIGH/SETLOW dance with simple
> > bitshift operations.
> > 
> > Index: ufs/ext2fs/ext2fs_subr.c
> > ===
> > RCS file: /cvs/src/sys/ufs/ext2fs/ext2fs_subr.c,v
> > retrieving revision 1.33
> > diff -u -p -u -r1.33 ext2fs_subr.c
> > --- ufs/ext2fs/ext2fs_subr.c14 Mar 2015 03:38:52 -  1.33
> > +++ ufs/ext2fs/ext2fs_subr.c27 Jan 2016 08:26:05 -
> > @@ -49,25 +49,6 @@
> >  #include 
> >  #include 
> >  
> > -union _qcvt {
> > -   int64_t qcvt;
> > -   int32_t val[2];
> > -};
> > -
> > -#define SETHIGH(q, h) {\
> > -   union _qcvt tmp;\
> > -   tmp.qcvt = (q); \
> > -   tmp.val[_QUAD_HIGHWORD] = (h);  \
> > -   (q) = tmp.qcvt; \
> > -}
> > -
> > -#define SETLOW(q, l) { \
> > -   union _qcvt tmp;\
> > -   tmp.qcvt = (q); \
> > -   tmp.val[_QUAD_LOWWORD] = (l);   \
> > -   (q) = tmp.qcvt; \
> > -}
> > -
> >  #ifdef _KERNEL
> >  
> >  /*
> > @@ -220,8 +201,8 @@ ext2fs_vinit(struct mount *mp, struct vo
> >  
> > /* Initialize modrev times */
> > getmicrouptime();
> > -   SETHIGH(ip->i_modrev, tv.tv_sec);
> > -   SETLOW(ip->i_modrev, tv.tv_usec * 4294);
> > +   ip->i_modrev = (u_quad_t)tv.tv_sec << 32;
> > +   ip->i_modrev |= (u_quad_t)tv.tv_usec * 4294;
> >  
> > *vpp = vp;
> >  
> > Index: ufs/ufs/ufs_vnops.c
> > ===
> > RCS file: /cvs/src/sys/ufs/ufs/ufs_vnops.c,v
> > retrieving revision 1.123
> > diff -u -p -u -r1.123 ufs_vnops.c
> > --- ufs/ufs/ufs_vnops.c 8 Dec 2015 15:31:01 -   1.123
> > +++ ufs/ufs/ufs_vnops.c 27 Jan 2016 08:26:10 -
> > @@ -78,24 +78,6 @@ int filt_ufswrite(struct knote *, long);
> >  int filt_ufsvnode(struct knote *, long);
> >  void filt_ufsdetach(struct knote *);
> >  
> > -union _qcvt {
> > -   int64_t qcvt;
> > -   int32_t val[2];
> > -};
> > -
> > -#define SETHIGH(q, h) { \
> > -   union _qcvt tmp; \
> > -   tmp.qcvt = (q); \
> > -   tmp.val[_QUAD_HIGHWORD] = (h); \
> > -   (q) = tmp.qcvt; \
> > -}
> > -#define SETLOW(q, l) { \
> > -   union _qcvt tmp; \
> > -   tmp.qcvt = (q); \
> > -   tmp.val[_QUAD_LOWWORD] = (l); \
> > -   (q) = tmp.qcvt; \
> > -}
> > -
> >  /*
> >   * A virgin directory (no blushing please).
> >   */
> > @@ -1879,8 +1861,8 @@ ufs_vinit(struct mount *mntp, struct vop
> >  * Initialize modrev times
> >  */
> > getmicrouptime();
> > -   SETHIGH(ip->i_modrev, mtv.tv_sec);
> > -   SETLOW(ip->i_modrev, mtv.tv_usec * 4294);
> > +   ip->i_modrev = (u_quad_t)mtv.tv_sec << 32;
> > +   ip->i_modrev |= (u_quad_t)mtv.tv_usec * 4294;
> > *vpp = vp;
> > return (0);
> >  }
> > 
> > cheers,
> > natano
> > 
> 



Re: tmpfs uiomove() conversion

2016-02-03 Thread Stefan Kempf
Martin Natano wrote:
> Stefan Kempf wrote:
> > I'm a bit uneasy though with passing signed values as-is to uiomove().
> > Can we somehow make it explicit that we know that the uiomove() argument is
> > >= 0?
> > 
> > Changing types all over the place would be too much churn though.
> > 
> > I'm leaning towards an explicit size_t cast for signed size arguments as
> > an annotation, e.g.
> > uiomove(buf, (size_t)signed_value, uio);
> 
> I agree, a cast like that would make the intent clear to the reader. See
> the regenerated diff below.
> 
> 
> > And a check in uiomove(). If a negative value gets passed in by
> > accident, it will be caught.
> > 
> > Thoughts?
> >  
> > Index: kern_subr.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_subr.c,v
> > retrieving revision 1.45
> > diff -u -p -U10 -r1.45 kern_subr.c
> > --- kern_subr.c 11 Dec 2015 16:07:02 -  1.45
> > +++ kern_subr.c 17 Jan 2016 10:56:11 -
> > @@ -46,20 +46,23 @@
> >  #include 
> >  
> >  int
> >  uiomove(void *cp, size_t n, struct uio *uio)
> >  {
> > struct iovec *iov;
> > size_t cnt;
> > int error = 0;
> > struct proc *p;
> >  
> > +   if (n > SSIZE_MAX)
> > +   return (EINVAL);
> > +
> > p = uio->uio_procp;
> >  
> >  #ifdef DIAGNOSTIC
> > if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE)
> > panic("uiomove: mode");
> > if (uio->uio_segflg == UIO_USERSPACE && p != curproc)
> > panic("uiomove: proc");
> >  #endif
> > while (n > 0 && uio->uio_resid) {
> > iov = uio->uio_iov;
> 
> I don't think this will fix the underlying problem. It is not possible
> to detect all types of overflow inside of uiomove(). Let's take a look
> at our tmpfs example: An off_t value is passed to uimove(). On i386,
> size_t is an unsigned 32 bit integer and off_t is a signed 64 bit
> integer.  When the off_t value is truncated on conversion, the
> 'if (n > SSIZE_MAX)' condition might trigger, or not, depending on the
> exact value of the original off_t value, resulting in unreliable
> behaviour.

D'oh. Yes, you are right. It's too late for such a check in uiomove().
Let's go with your second diff then.
 
> Unfortunately, I think the best we can do is to audit and fix all
> uiomove() callers, which we already do for the purpose of the
> conversion.
> 
> cheers,
> natano
> 
> 
> Index: tmpfs_subr.c
> ===
> RCS file: /cvs/src/sys/tmpfs/tmpfs_subr.c,v
> retrieving revision 1.14
> diff -u -p -u -r1.14 tmpfs_subr.c
> --- tmpfs_subr.c  17 Apr 2015 04:43:21 -  1.14
> +++ tmpfs_subr.c  17 Jan 2016 12:05:59 -
> @@ -740,7 +740,7 @@ tmpfs_dir_getdotents(tmpfs_node_t *node,
>   return EJUSTRETURN;
>   }
>  
> - if ((error = uiomovei(dp, dp->d_reclen, uio)) != 0) {
> + if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
>   return error;
>   }
>  
> @@ -837,7 +837,7 @@ tmpfs_dir_getdents(tmpfs_node_t *node, s
>   }
>  
>   /* Copy out the directory entry and continue. */
> - error = uiomovei(, dent.d_reclen, uio);
> + error = uiomove(, dent.d_reclen, uio);
>   if (error) {
>   break;
>   }
> @@ -1225,7 +1225,7 @@ tmpfs_uiomove(tmpfs_node_t *node, struct
>   if (pgoff + len < PAGE_SIZE) {
>   va = tmpfs_uio_lookup(node, pgnum);
>   if (va != (vaddr_t)NULL)
> - return uiomovei((void *)va + pgoff, len, uio);
> + return uiomove((void *)va + pgoff, len, uio);
>   }
>  
>   if (len >= TMPFS_UIO_MAXBYTES) {
> @@ -1249,7 +1249,7 @@ tmpfs_uiomove(tmpfs_node_t *node, struct
>   return error;
>   }
>  
> - error = uiomovei((void *)va + pgoff, sz, uio);
> + error = uiomove((void *)va + pgoff, sz, uio);
>   if (error == 0 && pgoff + sz < PAGE_SIZE)
>   tmpfs_uio_cache(node, pgnum, va);
>   else
> Index: tmpfs_vnops.c
> ===
> RCS file: /cvs/src/sys/tmpfs/tmpfs_vnops.c,v
> retrieving revision 1.23
> diff -u -p -u -r1.23 tmpfs_vnops.c
> --- tmpfs_vnops.c 8 Dec 2015 15:26:25 -   1.23
> +++ tmpfs_vnops.c 17 Jan 2016 12:06:12 -
> @@ -1017,8 +1017,8 @@ tmpfs_readlink(void *v)
>   KASSERT(vp->v_type == VLNK);
>  
>   node = VP_TO_TMPFS_NODE(vp);
> - error = uiomovei(node->tn_spec.tn_lnk.tn_link,
> - MIN(node->tn_size, uio->uio_resid), uio);
> + error = uiomove(node->tn_spec.tn_lnk.tn_link,
> + MIN((size_t)node->tn_size, uio->uio_resid), uio);
>   tmpfs_update(node, TMPFS_NODE_ACCESSED);
>  
>   return error;



Re: ntfs uiomove() conversion

2016-02-03 Thread Stefan Kempf
Martin Natano wrote:
> On Tue, Feb 02, 2016 at 07:30:08PM +0100, Stefan Kempf wrote:
> > Looks good. I agree with changing left to size_t. One small remark
> > though: size_t is defined as unsigned long. Do the DPRINTFs that print
> > the value of left have to be changed to use %zu in the format string?
> Said DDPRINTFs are in functions not touched by the diff, where 'left' is
> a cn_t. However, this got me thinking: The current code is a wild mix of
> off_t, cn_t and size_t variables for 'left' and the chunk size values
> ('tocopy', 'toread' and 'towrite').

Oh, I missed that, sorry.
 
> Please see the diff below. In addition to the uiomove() change, it also
> unifies the code in ntfs_subr.c to consistently make use of the size_t
> type for aforementioned variables. Note, that the upper bound is a
> variable called 'rsize' (a size_t) in all those cases.

Using the same type for those variables consistently sounds like a
good idea to me. Thanks for updating the diff.
 
> natano
> 
> 
> Index: ntfs/ntfs_subr.c
> ===
> RCS file: /cvs/src/sys/ntfs/ntfs_subr.c,v
> retrieving revision 1.44
> diff -u -p -u -r1.44 ntfs_subr.c
> --- ntfs/ntfs_subr.c  14 Mar 2015 03:38:52 -  1.44
> +++ ntfs/ntfs_subr.c  2 Feb 2016 20:09:53 -
> @@ -1340,7 +1340,8 @@ ntfs_writeattr_plain(struct ntfsmount *n
>  {
>   size_t  init;
>   int error = 0;
> - off_t   off = roff, left = rsize, towrite;
> + off_t   off = roff;
> + size_t  left = rsize, towrite;
>   caddr_t data = rdata;
>   struct ntvattr *vap;
>   *initp = 0;
> @@ -1351,7 +1352,7 @@ ntfs_writeattr_plain(struct ntfsmount *n
>   if (error)
>   return (error);
>   towrite = MIN(left, ntfs_cntob(vap->va_vcnend + 1) - off);
> - DDPRINTF("ntfs_writeattr_plain: o: %lld, s: %lld "
> + DDPRINTF("ntfs_writeattr_plain: o: %lld, s: %zu "
>   "(%llu - %llu)\n", off, towrite,
>   vap->va_vcnstart, vap->va_vcnend);
>   error = ntfs_writentvattr_plain(ntmp, ip, vap,
> @@ -1359,11 +1360,9 @@ ntfs_writeattr_plain(struct ntfsmount *n
>towrite, data, , uio);
>   if (error) {
>   DPRINTF("ntfs_writeattr_plain: ntfs_writentvattr_plain "
> - "failed: o: %d, s: %d\n",
> - (u_int32_t)off, (u_int32_t)towrite);
> - DPRINTF("ntfs_writeattr_plain: attrib: %d - %d\n",
> - (u_int32_t)vap->va_vcnstart,
> - (u_int32_t)vap->va_vcnend);
> + "failed: o: %lld, s: %zu\n", off, towrite);
> + DPRINTF("ntfs_writeattr_plain: attrib: %llu - %llu\n",
> + vap->va_vcnstart, vap->va_vcnend);
>   ntfs_ntvattrrele(vap);
>   break;
>   }
> @@ -1390,10 +1389,10 @@ ntfs_writentvattr_plain(struct ntfsmount
>   int error = 0;
>   off_t   off;
>   int cnt;
> - cn_tccn, ccl, cn, left, cl;
> + cn_tccn, ccl, cn, cl;
>   caddr_t data = rdata;
>   struct buf *bp;
> - size_t  tocopy;
> + size_t  left, tocopy;
>  
>   *initp = 0;
>  
> @@ -1414,7 +1413,7 @@ ntfs_writentvattr_plain(struct ntfsmount
>   ccn = vap->va_vruncn[cnt];
>   ccl = vap->va_vruncl[cnt];
>  
> - DDPRINTF("ntfs_writentvattr_plain: left %llu, cn: 0x%llx, "
> + DDPRINTF("ntfs_writentvattr_plain: left %zu, cn: 0x%llx, "
>   "cl: %llu, off: %lld\n", left, ccn, ccl, off);
>  
>   if (ntfs_cntob(ccl) < off) {
> @@ -1440,7 +1439,7 @@ ntfs_writentvattr_plain(struct ntfsmount
>   cl = ntfs_btocl(tocopy + off);
>   KASSERT(cl == 1 && tocopy <= ntfs_cntob(1));
>   DDPRINTF("ntfs_writentvattr_plain: write: cn: 0x%llx "
> - "cl: %llu, off: %lld len: %llu, left: %llu\n",
> + "cl: %llu, off: %lld len: %zu, left: %zu\n",
>   cn, cl, off, tocopy, left);
>   if ((off == 0) && (tocopy == ntfs_cntob(cl)))
>   {
> @@ -1456,7 +1455,7 @@ ntfs_writentvattr_plain(struct ntfsmount
>  

Re: ntfs uiomove() conversion

2016-02-02 Thread Stefan Kempf
Martin Natano wrote:
> Below the conversion to uiomove() for ntfs. In the first three hunks the
> size passed to uiomove(i) already was of type size_t. I also converted
> the 'left' variable in ntfs_readattr() to size_t, because it tracks the
> remainder of 'rsize', which also is size_t.

Looks good. I agree with changing left to size_t. One small remark
though: size_t is defined as unsigned long. Do the DPRINTFs that print
the value of left have to be changed to use %zu in the format string?

Not sure whether the compiler warns when you specify an unsigned long
long via %llu, but pass in a long as argument.
 
> Index: ntfs/ntfs_subr.c
> ===
> RCS file: /cvs/src/sys/ntfs/ntfs_subr.c,v
> retrieving revision 1.44
> diff -u -p -u -r1.44 ntfs_subr.c
> --- ntfs/ntfs_subr.c  14 Mar 2015 03:38:52 -  1.44
> +++ ntfs/ntfs_subr.c  31 Jan 2016 09:44:42 -
> @@ -1456,7 +1456,7 @@ ntfs_writentvattr_plain(struct ntfsmount
>   }
>   }
>   if (uio) {
> - error = uiomovei(bp->b_data + off, tocopy, uio);
> + error = uiomove(bp->b_data + off, tocopy, uio);
>   if (error != 0)
>   break;
>   } else
> @@ -1554,7 +1554,7 @@ ntfs_readntvattr_plain(struct ntfsmount 
>   return (error);
>   }
>   if (uio) {
> - error = uiomovei(bp->b_data + 
> off,
> + error = uiomove(bp->b_data + 
> off,
>   tocopy, uio);
>   if (error != 0)
>   break;
> @@ -1600,7 +1600,7 @@ ntfs_readntvattr_plain(struct ntfsmount 
>   } else {
>   DDPRINTF("ntfs_readnvattr_plain: data is in mft record\n");
>   if (uio) 
> - error = uiomovei(vap->va_datap + roff, rsize, uio);
> + error = uiomove(vap->va_datap + roff, rsize, uio);
>   else
>   memcpy(rdata, vap->va_datap + roff, rsize);
>   *initp += rsize;
> @@ -1684,9 +1684,10 @@ ntfs_readattr(struct ntfsmount *ntmp, st
>   if (vap->va_compression && vap->va_compressalg) {
>   u_int8_t   *cup;
>   u_int8_t   *uup;
> - off_t   off = roff, left = rsize, tocopy;
> + off_t   off = roff;
>   caddr_t data = rdata;
>   cn_tcn;
> + size_t  left = rsize, tocopy;
>  
>   DDPRINTF("ntfs_ntreadattr: compression: %u\n",
>   vap->va_compressalg);
> @@ -1711,7 +1712,7 @@ ntfs_readattr(struct ntfsmount *ntmp, st
>  
>   if (init == ntfs_cntob(NTFS_COMPUNIT_CL)) {
>   if (uio)
> - error = uiomovei(cup + off, tocopy, 
> uio);
> + error = uiomove(cup + off, tocopy, uio);
>   else
>   memcpy(data, cup + off, tocopy);
>   } else if (init == 0) {
> @@ -1730,7 +1731,7 @@ ntfs_readattr(struct ntfsmount *ntmp, st
>   if (error)
>   break;
>   if (uio)
> - error = uiomovei(uup + off, tocopy, 
> uio);
> + error = uiomove(uup + off, tocopy, uio);
>   else
>   memcpy(data, uup + off, tocopy);
>   }
> 
> cheers,
> natano
> 



Re: udf uiomove() conversion

2016-01-26 Thread Stefan Kempf
Martin Natano wrote:
> Below the conversion to uiomove() for isofs/udf/. Note that converting
> size to size_t is not possible in udf_read(), as udf_readatoffset()
> requires a pointer to an integer variable. Changing that would cause a
> lot of code churn, so i chose to truncate uio_resid to INT_MAX instead.
> udf_readatoffset() wouldn't transfer more than MAXBSIZE bytes anyway.

Looks good. If we already have to trust udf_readatoffset to do
reasonable things with size, then passing MAXBSIZE to ulmin would
also be an option.

But I'm fine with the diff as-is also.
 
> Index: udf_vnops.c
> ===
> RCS file: /cvs/src/sys/isofs/udf/udf_vnops.c,v
> retrieving revision 1.61
> diff -u -p -u -r1.61 udf_vnops.c
> --- udf_vnops.c   23 Sep 2015 15:37:26 -  1.61
> +++ udf_vnops.c   20 Jan 2016 08:54:21 -
> @@ -445,13 +445,12 @@ udf_read(void *v)
>  
>   while (uio->uio_offset < fsize && uio->uio_resid > 0) {
>   offset = uio->uio_offset;
> - if (uio->uio_resid + offset <= fsize)
> - size = uio->uio_resid;
> - else
> + size = ulmin(uio->uio_resid, INT_MAX);
> + if (size > fsize - offset)
>   size = fsize - offset;
>   error = udf_readatoffset(up, , offset, , );
>   if (error == 0)
> - error = uiomovei(data, size, uio);
> + error = uiomove(data, (size_t)size, uio);
>   if (bp != NULL) {
>   brelse(bp);
>   bp = NULL;
> @@ -543,7 +542,7 @@ struct udf_uiodir {
>  static int
>  udf_uiodir(struct udf_uiodir *uiodir, struct uio *uio, long off)
>  {
> - int de_size = DIRENT_SIZE(uiodir->dirent);
> + size_t de_size = DIRENT_SIZE(uiodir->dirent);
>  
>   if (uio->uio_resid < de_size) {
>   uiodir->eofflag = 0;
> @@ -552,7 +551,7 @@ udf_uiodir(struct udf_uiodir *uiodir, st
>   uiodir->dirent->d_off = off;
>   uiodir->dirent->d_reclen = de_size;
>  
> - return (uiomovei(uiodir->dirent, de_size, uio));
> + return (uiomove(uiodir->dirent, de_size, uio));
>  }
>  
>  static struct udf_dirstream *
> 
> cheers,
> natano
> 



Re: tty uiomove() conversion

2016-01-23 Thread Stefan Kempf
Martin Natano wrote:
> Below the conversion from uiomovei() to uiomove() for kern/tty.c and
> kern/tty_pty.c. 'cc' consistently contains small, non-negative integer
> values, so leaving the type as int should be ok. It could as well be
> changed to size_t, but I don't see a benefit in doing so for that
> particular case, except for a lot of unnecessary code churn.

Looks good. It's easy to see that cc is always > 0 when passed to
uiomove() here.
 
> Index: kern/tty.c
> ===
> RCS file: /cvs/src/sys/kern/tty.c,v
> retrieving revision 1.127
> diff -u -p -u -r1.127 tty.c
> --- kern/tty.c5 Dec 2015 10:11:53 -   1.127
> +++ kern/tty.c13 Jan 2016 21:22:22 -
> @@ -1782,7 +1782,7 @@ loop:
>   if (cc == 0) {
>   cc = MIN(uio->uio_resid, OBUFSIZ);
>   cp = obuf;
> - error = uiomovei(cp, cc, uio);
> + error = uiomove(cp, cc, uio);
>   if (error) {
>   cc = 0;
>   break;
> Index: kern/tty_pty.c
> ===
> RCS file: /cvs/src/sys/kern/tty_pty.c,v
> retrieving revision 1.74
> diff -u -p -u -r1.74 tty_pty.c
> --- kern/tty_pty.c5 Dec 2015 10:11:53 -   1.74
> +++ kern/tty_pty.c13 Jan 2016 21:22:22 -
> @@ -461,7 +461,7 @@ ptcread(dev_t dev, struct uio *uio, int 
>   if (pti->pt_send & TIOCPKT_IOCTL) {
>   cc = MIN(uio->uio_resid,
>   sizeof(tp->t_termios));
> - error = uiomovei(>t_termios, cc, 
> uio);
> + error = uiomove(>t_termios, cc, 
> uio);
>   if (error)
>   return (error);
>   }
> @@ -496,7 +496,7 @@ ptcread(dev_t dev, struct uio *uio, int 
>   bufcc = cc;
>   if (cc <= 0)
>   break;
> - error = uiomovei(buf, cc, uio);
> + error = uiomove(buf, cc, uio);
>   }
>   ttwakeupwr(tp);
>   if (bufcc)
> @@ -529,7 +529,7 @@ again:
>   if (cc > bufcc)
>   bufcc = cc;
>   cp = buf;
> - error = uiomovei(cp, cc, uio);
> + error = uiomove(cp, cc, uio);
>   if (error)
>   goto done;
>   /* check again for safety */
> @@ -553,7 +553,7 @@ again:
>   if (cc > bufcc)
>   bufcc = cc;
>   cp = buf;
> - error = uiomovei(cp, cc, uio);
> + error = uiomove(cp, cc, uio);
>   if (error)
>   goto done;
>   /* check again for safety */
> 
> cheers,
> natano
> 



Re: msdosfs uiomove() conversion

2016-01-23 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for msdosfs. This diff prevents
> truncation of uio_resid in both msdosfs_read() and msdosfs_write().

Yes. That's similar to the cd9660 diff.
 
> Index: msdosfs/msdosfs_vnops.c
> ===
> RCS file: /cvs/src/sys/msdosfs/msdosfs_vnops.c,v
> retrieving revision 1.105
> diff -u -p -u -r1.105 msdosfs_vnops.c
> --- msdosfs/msdosfs_vnops.c   13 Jan 2016 10:00:55 -  1.105
> +++ msdosfs/msdosfs_vnops.c   21 Jan 2016 15:52:04 -
> @@ -516,10 +516,9 @@ msdosfs_read(void *v)
>   struct msdosfsmount *pmp = dep->de_pmp;
>   struct uio *uio = ap->a_uio;
>   int isadir, error = 0;
> - uint32_t n, diff, size;
> + uint32_t n, diff, size, on;
>   struct buf *bp;
>   daddr_t cn;
> - long on;
>  
>   /*
>* If they didn't ask for any data, then we are done.
> @@ -537,7 +536,7 @@ msdosfs_read(void *v)
>   cn = de_cluster(pmp, uio->uio_offset);
>   size = pmp->pm_bpcluster;
>   on = uio->uio_offset & pmp->pm_crbomask;
> - n = min((uint32_t) (pmp->pm_bpcluster - on), uio->uio_resid);
> + n = ulmin(pmp->pm_bpcluster - on, uio->uio_resid);
>  
>   /*
>* de_FileSize is uint32_t, and we know that uio_offset <
> @@ -569,7 +568,7 @@ msdosfs_read(void *v)
>   brelse(bp);
>   return (error);
>   }
> - error = uiomovei(bp->b_data + on, (int) n, uio);
> + error = uiomove(bp->b_data + on, n, uio);
>   brelse(bp);
>   } while (error == 0 && uio->uio_resid > 0 && n != 0);
>   if (!isadir && !(vp->v_mount->mnt_flag & MNT_NOATIME))
> @@ -584,9 +583,8 @@ int
>  msdosfs_write(void *v)
>  {
>   struct vop_write_args *ap = v;
> - int n;
> - int croffset;
> - int resid;
> + uint32_t n, croffset;
> + size_t resid;
>   ssize_t overrun;
>   int extended = 0;
>   uint32_t osize;
> @@ -715,7 +713,7 @@ msdosfs_write(void *v)
>   }
>   }
>  
> - n = min(uio->uio_resid, pmp->pm_bpcluster - croffset);
> + n = ulmin(uio->uio_resid, pmp->pm_bpcluster - croffset);
>   if (uio->uio_offset + n > dep->de_FileSize) {
>   dep->de_FileSize = uio->uio_offset + n;
>   uvm_vnp_setsize(vp, dep->de_FileSize);
> @@ -729,7 +727,7 @@ msdosfs_write(void *v)
>   /*
>* Copy the data from user space into the buf header.
>*/
> - error = uiomovei(bp->b_data + croffset, n, uio);
> + error = uiomove(bp->b_data + croffset, n, uio);
>  
>   /*
>* If they want this synchronous then write it and wait for
> @@ -1554,7 +1552,7 @@ msdosfs_readdir(void *v)
>   sizeof(struct direntry);
>   if (uio->uio_resid < dirbuf.d_reclen)
>   goto out;
> - error = uiomovei(, dirbuf.d_reclen, uio);
> + error = uiomove(, dirbuf.d_reclen, uio);
>   if (error)
>   goto out;
>   offset = dirbuf.d_off;
> @@ -1682,7 +1680,7 @@ msdosfs_readdir(void *v)
>   goto out;
>   }
>   wlast = -1;
> - error = uiomovei(, dirbuf.d_reclen, uio);
> + error = uiomove(, dirbuf.d_reclen, uio);
>   if (error) {
>   brelse(bp);
>   goto out;
> 
> cheers,
> natano
> 



Use uiomove() in if_tun and if_pppx

2016-01-22 Thread Stefan Kempf
The following diff prevents integer truncation of uio_resid
by using ulmin() instead of min() and calls uiomove() instead
of the legacy uiomovei().

That's straightforward because the m_len mbuf field is unsigned.
mlen can be turned to a size_t because it's set to MLEN or MCLBYTES,
which is > 0.

I plan to commit this in a few days unless there are objections.

Index: if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.49
diff -u -p -r1.49 if_pppx.c
--- if_pppx.c   14 Jan 2016 09:20:31 -  1.49
+++ if_pppx.c   22 Jan 2016 18:23:04 -
@@ -273,7 +273,8 @@ pppxread(dev_t dev, struct uio *uio, int
struct pppx_dev *pxd = pppx_dev2pxd(dev);
struct mbuf *m, *m0;
int error = 0;
-   int len, s;
+   int s;
+   size_t len;
 
if (!pxd)
return (ENXIO);
@@ -292,9 +293,9 @@ pppxread(dev_t dev, struct uio *uio, int
}
 
while (m0 != NULL && uio->uio_resid > 0 && error == 0) {
-   len = min(uio->uio_resid, m0->m_len);
+   len = ulmin(uio->uio_resid, m0->m_len);
if (len != 0)
-   error = uiomovei(mtod(m0, caddr_t), len, uio);
+   error = uiomove(mtod(m0, caddr_t), len, uio);
m = m_free(m0);
m0 = m;
}
@@ -313,8 +314,9 @@ pppxwrite(dev_t dev, struct uio *uio, in
uint32_t proto;
struct mbuf *top, **mp, *m;
struct niqueue *ifq;
-   int tlen, mlen;
+   int tlen;
int error = 0;
+   size_t mlen;
 #if NBPFILTER > 0
int s;
 #endif
@@ -342,8 +344,8 @@ pppxwrite(dev_t dev, struct uio *uio, in
mp = 
 
while (error == 0 && uio->uio_resid > 0) {
-   m->m_len = min(mlen, uio->uio_resid);
-   error = uiomovei(mtod (m, caddr_t), m->m_len, uio);
+   m->m_len = ulmin(mlen, uio->uio_resid);
+   error = uiomove(mtod (m, caddr_t), m->m_len, uio);
*mp = m;
mp = >m_next;
if (error == 0 && uio->uio_resid > 0) {
Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.165
diff -u -p -r1.165 if_tun.c
--- if_tun.c7 Jan 2016 05:31:17 -   1.165
+++ if_tun.c22 Jan 2016 18:23:05 -
@@ -764,7 +764,8 @@ tun_dev_read(struct tun_softc *tp, struc
struct ifnet*ifp = >tun_if;
struct mbuf *m, *m0;
unsigned int ifidx;
-   int  error = 0, len, s;
+   int  error = 0, s;
+   size_t   len;
 
if ((tp->tun_flags & TUN_READY) != TUN_READY)
return (EHOSTDOWN);
@@ -825,9 +826,9 @@ tun_dev_read(struct tun_softc *tp, struc
}
 
while (m0 != NULL && uio->uio_resid > 0 && error == 0) {
-   len = min(uio->uio_resid, m0->m_len);
+   len = ulmin(uio->uio_resid, m0->m_len);
if (len != 0)
-   error = uiomovei(mtod(m0, caddr_t), len, uio);
+   error = uiomove(mtod(m0, caddr_t), len, uio);
m = m_free(m0);
m0 = m;
}
@@ -872,7 +873,8 @@ tun_dev_write(struct tun_softc *tp, stru
struct niqueue  *ifq;
u_int32_t   *th;
struct mbuf *top, **mp, *m;
-   int  error=0, tlen, mlen;
+   int error = 0, tlen;
+   size_t  mlen;
 #if NBPFILTER > 0
int  s;
 #endif
@@ -911,8 +913,8 @@ tun_dev_write(struct tun_softc *tp, stru
m->m_data += ETHER_ALIGN;
}
while (error == 0 && uio->uio_resid > 0) {
-   m->m_len = min(mlen, uio->uio_resid);
-   error = uiomovei(mtod (m, caddr_t), m->m_len, uio);
+   m->m_len = ulmin(mlen, uio->uio_resid);
+   error = uiomove(mtod (m, caddr_t), m->m_len, uio);
*mp = m;
mp = >m_next;
if (error == 0 && uio->uio_resid > 0) {



Re: dev/video.c uiomove() conversion

2016-01-22 Thread Stefan Kempf
Martin Natano wrote:
> Below is a diff to convert videoread() to use uiomove() instead of
> uiomovei(). Note that passing sc_fsize to ulmin() is not problematic,
> even though it is an int, because the value is always positive.

Yes, sc_fsize *should* always be positive. I had to convince myself
though :-) uvideo(4) sets sc_fsize. There are checks that make
sure that it's not larger than an allocated buffer. And sc_fsize
is used as length argument in a bcopy() in uvideo(4) also. If it was
somehow negative, the bcopy() would cause problems before we got to
the uiomove call below.

So I think this diff is ok.
 
> cheers,
> natano
> 
> Index: dev/video.c
> ===
> RCS file: /cvs/src/sys/dev/video.c,v
> retrieving revision 1.37
> diff -u -p -u -r1.37 video.c
> --- dev/video.c   29 Aug 2015 20:51:46 -  1.37
> +++ dev/video.c   1 Jan 2016 14:31:24 -
> @@ -136,7 +136,8 @@ int
>  videoread(dev_t dev, struct uio *uio, int ioflag)
>  {
>   struct video_softc *sc;
> - int unit, error, size;
> + int unit, error;
> + size_t size;
>  
>   unit = VIDEOUNIT(dev);
>   if (unit >= video_cd.cd_ndevs ||
> @@ -169,16 +170,13 @@ videoread(dev_t dev, struct uio *uio, in
>   }
>  
>   /* move no more than 1 frame to userland, as per specification */
> - if (sc->sc_fsize < uio->uio_resid)
> - size = sc->sc_fsize;
> - else
> - size = uio->uio_resid;
> - error = uiomovei(sc->sc_fbuffer, size, uio);
> + size = ulmin(uio->uio_resid, sc->sc_fsize);
> + error = uiomove(sc->sc_fbuffer, size, uio);
>   sc->sc_frames_ready--;
>   if (error)
>   return (error);
>  
> - DPRINTF(("uiomove successfully done (%d bytes)\n", size));
> + DPRINTF(("uiomove successfully done (%zu bytes)\n", size));
>  
>   return (0);
>  }
> 



Re: 0 is UIO_USERSPACE

2016-01-19 Thread Stefan Kempf
Martin Natano wrote:
> 0 is the same as UIO_USERSPACE, but more readable. No binary change.

Makes sense. The entire tree uses the symbolic enum values.
These are the only places that use 0.
 
> Index: ./arch/octeon/dev/amdcf.c
> ===
> RCS file: /cvs/src/sys/arch/octeon/dev/amdcf.c,v
> retrieving revision 1.1
> diff -u -p -u -r1.1 amdcf.c
> --- ./arch/octeon/dev/amdcf.c 20 Jul 2015 19:44:32 -  1.1
> +++ ./arch/octeon/dev/amdcf.c 9 Jan 2016 17:56:04 -
> @@ -548,7 +548,7 @@ amdcfioctl(dev_t dev, u_long xfer, caddr
>   auio.uio_iov = 
>   auio.uio_iovcnt = 1;
>   auio.uio_resid = fop->df_count;
> - auio.uio_segflg = 0;
> + auio.uio_segflg = UIO_USERSPACE;
>   auio.uio_offset =
>   fop->df_startblk * sc->sc_dk.dk_label->d_secsize;
>   auio.uio_procp = p;
> Index: ./arch/octeon/dev/octcf.c
> ===
> RCS file: /cvs/src/sys/arch/octeon/dev/octcf.c,v
> retrieving revision 1.27
> diff -u -p -u -r1.27 octcf.c
> --- ./arch/octeon/dev/octcf.c 20 Jul 2015 01:38:31 -  1.27
> +++ ./arch/octeon/dev/octcf.c 9 Jan 2016 17:56:09 -
> @@ -616,7 +616,7 @@ octcfioctl(dev_t dev, u_long xfer, caddr
>   auio.uio_iov = 
>   auio.uio_iovcnt = 1;
>   auio.uio_resid = fop->df_count;
> - auio.uio_segflg = 0;
> + auio.uio_segflg = UIO_USERSPACE;
>   auio.uio_offset =
>   fop->df_startblk * wd->sc_dk.dk_label->d_secsize;
>   auio.uio_procp = p;
> Index: ./dev/ata/wd.c
> ===
> RCS file: /cvs/src/sys/dev/ata/wd.c,v
> retrieving revision 1.119
> diff -u -p -u -r1.119 wd.c
> --- ./dev/ata/wd.c26 Aug 2015 22:29:39 -  1.119
> +++ ./dev/ata/wd.c9 Jan 2016 17:56:10 -
> @@ -836,7 +836,7 @@ wdioctl(dev_t dev, u_long xfer, caddr_t 
>   auio.uio_iov = 
>   auio.uio_iovcnt = 1;
>   auio.uio_resid = fop->df_count;
> - auio.uio_segflg = 0;
> + auio.uio_segflg = UIO_USERSPACE;
>   auio.uio_offset =
>   fop->df_startblk * wd->sc_dk.dk_label->d_secsize;
>   auio.uio_procp = p;
> 
> cheers,
> natano
> 



Re: cd9660 uiomove() conversion

2016-01-18 Thread Stefan Kempf
Diff reads good. ok?

Some thoughts (mostly for myself) inline.

Martin Natano wrote:
> Below the uiomove conversion for isofs/cd9660/cd9660_vnops.c.
> 
> cheers,
> natano
> 
> Index: isofs/cd9660/cd9660_vnops.c
> ===
> RCS file: /cvs/src/sys/isofs/cd9660/cd9660_vnops.c,v
> retrieving revision 1.73
> diff -u -p -r1.73 cd9660_vnops.c
> --- isofs/cd9660/cd9660_vnops.c   11 Dec 2015 11:25:55 -  1.73
> +++ isofs/cd9660/cd9660_vnops.c   1 Jan 2016 17:45:50 -
> @@ -227,7 +227,8 @@ cd9660_read(void *v)
>   daddr_t lbn, rablock;
>   off_t diff;
>   int error = 0;
> - long size, n, on;
> + long size, on;
> + size_t n;
>  
>   if (uio->uio_resid == 0)
>   return (0);
> @@ -240,8 +241,7 @@ cd9660_read(void *v)
>  
>   lbn = lblkno(imp, uio->uio_offset);
>   on = blkoff(imp, uio->uio_offset);
> - n = min((u_int)(imp->logical_block_size - on),
> - uio->uio_resid);
> + n = ulmin(imp->logical_block_size - on, uio->uio_resid);

The subtraction can't overflow, because blkoff is basically a
uio->uio_offset % imp->logical_block_size. ulmin() protects against
truncation. The result is always positive and making n size_t is
straightforward.

>   diff = (off_t)ip->i_size - uio->uio_offset;
>   if (diff <= 0)
>   return (0);
> @@ -270,13 +270,13 @@ cd9660_read(void *v)
>   } else
>   error = bread(vp, lbn, size, );
>   ci->ci_lastr = lbn;
> - n = min(n, size - bp->b_resid);
> + n = ulmin(n, size - bp->b_resid);

bp->b_resid is supposed to be <= size because it's the
remaining I/O to get the whole buf filled with size bytes.
So bp->b_resid should be initialized with size initially
and then only decrease.

>   if (error) {
>   brelse(bp);
>   return (error);
>   }
>  
> - error = uiomovei(bp->b_data + on, (int)n, uio);
> + error = uiomove(bp->b_data + on, n, uio);

Straightforward with n being a size_t.
  
>   brelse(bp);
>   } while (error == 0 && uio->uio_resid > 0 && n != 0);
> @@ -344,7 +344,7 @@ iso_uiodir(idp,dp,off)
>   }
>  
>   dp->d_off = off;
> - if ((error = uiomovei((caddr_t)dp, dp->d_reclen, idp->uio)) != 0)
> + if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0)

Straightforward. dp->d_reclen is unsigned.

>   return (error);
>   idp->uio_off = off;
>   return (0);
> @@ -657,7 +657,7 @@ cd9660_readlink(void *v)
>*/
>   if (uio->uio_segflg != UIO_SYSSPACE ||
>   uio->uio_iov->iov_len < MAXPATHLEN) {
> - error = uiomovei(symname, symlen, uio);
> + error = uiomove(symname, symlen, uio);

Straightforward, symlen is unsigned. I also checked that the computation
of symlen does not wrap around.

>   pool_put(_pool, symname);
>   return (error);
>   }
> 



Re: ppp_tty uiomove() conversion

2016-01-18 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for net/ppp_tty.c. M_TRAILINGSPACE()
> returns int, but the result can't be negative, so using u_int for the
> return value should be fine.

Looks good. m->m_len is unsigned and yes, given that the mbuf fields
aren't corrupted M_TRAILINGSPACE is >= 0.

ok?
 
> Index: net/ppp_tty.c
> ===
> RCS file: /cvs/src/sys/net/ppp_tty.c,v
> retrieving revision 1.41
> diff -u -p -u -r1.41 ppp_tty.c
> --- net/ppp_tty.c 21 Dec 2015 21:49:02 -  1.41
> +++ net/ppp_tty.c 13 Jan 2016 19:44:53 -
> @@ -321,7 +321,7 @@ pppread(struct tty *tp, struct uio *uio,
>  splx(s);
>  
>  for (m = m0; m && uio->uio_resid; m = m->m_next)
> - if ((error = uiomovei(mtod(m, u_char *), m->m_len, uio)) != 0)
> + if ((error = uiomove(mtod(m, u_char *), m->m_len, uio)) != 0)
>   break;
>  m_freem(m0);
>  return (error);
> @@ -336,7 +336,8 @@ pppwrite(struct tty *tp, struct uio *uio
>  struct ppp_softc *sc = (struct ppp_softc *)tp->t_sc;
>  struct mbuf *m, *m0, **mp;
>  struct sockaddr dst;
> -int len, error;
> +u_int len;
> +int error;
>  
>  if ((tp->t_state & TS_CARR_ON) == 0 && (tp->t_cflag & CLOCAL) == 0)
>   return 0;   /* wrote 0 bytes */
> @@ -361,7 +362,7 @@ pppwrite(struct tty *tp, struct uio *uio
>   len = M_TRAILINGSPACE(m);
>   if (len > uio->uio_resid)
>   len = uio->uio_resid;
> - if ((error = uiomovei(mtod(m, u_char *), len, uio)) != 0) {
> + if ((error = uiomove(mtod(m, u_char *), len, uio)) != 0) {
>   m_freem(m0);
>   return (error);
>   }
> 
> cheers,
> natano
> 



Re: sys_pipe.c uiomove() conversion

2016-01-14 Thread Stefan Kempf
pipe_write() has an orig_resid = uio->uio_resid assignment. So orig_resid
had better be a size_t also. Looks good otherwise.

ok with the updated diff below?

Index: kern/sys_pipe.c
===
RCS file: /cvs/src/sys/kern/sys_pipe.c,v
retrieving revision 1.71
diff -u -p -r1.71 sys_pipe.c
--- kern/sys_pipe.c 6 Jan 2016 17:59:30 -   1.71
+++ kern/sys_pipe.c 15 Jan 2016 04:32:25 -
@@ -297,8 +297,7 @@ pipe_read(struct file *fp, off_t *poff, 
 {
struct pipe *rpipe = fp->f_data;
int error;
-   int nread = 0;
-   int size;
+   size_t size, nread = 0;
 
error = pipelock(rpipe);
if (error)
@@ -316,7 +315,7 @@ pipe_read(struct file *fp, off_t *poff, 
size = rpipe->pipe_buffer.cnt;
if (size > uio->uio_resid)
size = uio->uio_resid;
-   error = 
uiomovei(>pipe_buffer.buffer[rpipe->pipe_buffer.out],
+   error = 
uiomove(>pipe_buffer.buffer[rpipe->pipe_buffer.out],
size, uio);
if (error) {
break;
@@ -413,7 +412,7 @@ int
 pipe_write(struct file *fp, off_t *poff, struct uio *uio, struct ucred *cred)
 {
int error = 0;
-   int orig_resid;
+   size_t orig_resid;
struct pipe *wpipe, *rpipe;
 
rpipe = fp->f_data;
@@ -460,7 +459,7 @@ pipe_write(struct file *fp, off_t *poff,
orig_resid = uio->uio_resid;
 
while (uio->uio_resid) {
-   int space;
+   size_t space;
 
 retrywrite:
if (wpipe->pipe_state & PIPE_EOF) {
@@ -476,8 +475,8 @@ retrywrite:
 
if (space > 0) {
if ((error = pipelock(wpipe)) == 0) {
-   int size;   /* Transfer size */
-   int segsize;/* first segment to transfer */
+   size_t size;/* Transfer size */
+   size_t segsize; /* first segment to transfer */
 
/*
 * If a process blocked in uiomove, our
@@ -514,7 +513,7 @@ retrywrite:
 
/* Transfer first segment */
 
-   error = 
uiomovei(>pipe_buffer.buffer[wpipe->pipe_buffer.in], 
+   error = 
uiomove(>pipe_buffer.buffer[wpipe->pipe_buffer.in],
segsize, uio);
 
if (error == 0 && segsize < size) {
@@ -529,7 +528,7 @@ retrywrite:
panic("Expected pipe buffer 
wraparound disappeared");
 #endif
 
-   error = 
uiomovei(>pipe_buffer.buffer[0],
+   error = 
uiomove(>pipe_buffer.buffer[0],
size - segsize, uio);
}
if (error == 0) {

Martin Natano wrote:
> Below the uiomove() conversion for kern/sys_pipe.c. This diff eliminates
> a potential overflow of the nread variable.
> 
> I think it would benefit readability to sprinkle some ulmin()/MIN() into
> that code and wrap lines to <80 chars, but that's probably a matter for
> another diff. Including those changes adds too much noise to this diff.
> 
> Index: kern/sys_pipe.c
> ===
> RCS file: /cvs/src/sys/kern/sys_pipe.c,v
> retrieving revision 1.71
> diff -u -p -u -r1.71 sys_pipe.c
> --- kern/sys_pipe.c   6 Jan 2016 17:59:30 -   1.71
> +++ kern/sys_pipe.c   9 Jan 2016 14:07:38 -
> @@ -297,8 +297,7 @@ pipe_read(struct file *fp, off_t *poff, 
>  {
>   struct pipe *rpipe = fp->f_data;
>   int error;
> - int nread = 0;
> - int size;
> + size_t size, nread = 0;
>  
>   error = pipelock(rpipe);
>   if (error)
> @@ -316,7 +315,7 @@ pipe_read(struct file *fp, off_t *poff, 
>   size = rpipe->pipe_buffer.cnt;
>   if (size > uio->uio_resid)
>   size = uio->uio_resid;
> - error = 
> uiomovei(>pipe_buffer.buffer[rpipe->pipe_buffer.out],
> + error = 
> uiomove(>pipe_buffer.buffer[rpipe->pipe_buffer.out],
>   size, uio);
>   if (error) {
>   break;
> @@ -460,7 +459,7 @@ pipe_write(struct file *fp, off_t *poff,
>   orig_resid = uio->uio_resid;
>  
>   while (uio->uio_resid) {
> - int space;
> + size_t space;
>  
>  retrywrite:
>   if (wpipe->pipe_state & PIPE_EOF) {
> @@ -476,8 +475,8 @@ retrywrite:
>  
>   if (space > 0) {
>   if ((error = 

Re: klog uiomove() conversion

2016-01-12 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for kern/subr_log.c. msg_buf[rsx] are all
> of type long, but are always positive. This diff prevents truncation of
> uio_resid (and l) due to min() usage.

Makes sense.

ok?
 
> Index: kern/subr_log.c
> ===
> RCS file: /cvs/src/sys/kern/subr_log.c,v
> retrieving revision 1.36
> diff -u -p -u -r1.36 subr_log.c
> --- kern/subr_log.c   7 Jan 2016 12:27:07 -   1.36
> +++ kern/subr_log.c   9 Jan 2016 14:49:27 -
> @@ -180,7 +180,7 @@ int
>  logread(dev_t dev, struct uio *uio, int flag)
>  {
>   struct msgbuf *mbp = msgbufp;
> - long l;
> + size_t l;
>   int s;
>   int error = 0;
>  
> @@ -202,13 +202,14 @@ logread(dev_t dev, struct uio *uio, int 
>   logsoftc.sc_state &= ~LOG_RDWAIT;
>  
>   while (uio->uio_resid > 0) {
> - l = mbp->msg_bufx - mbp->msg_bufr;
> - if (l < 0)
> + if (mbp->msg_bufx >= mbp->msg_bufr)
> + l = mbp->msg_bufx - mbp->msg_bufr;
> + else
>   l = mbp->msg_bufs - mbp->msg_bufr;
> - l = min(l, uio->uio_resid);
> + l = ulmin(l, uio->uio_resid);
>   if (l == 0)
>   break;
> - error = uiomovei(>msg_bufc[mbp->msg_bufr], (int)l, uio);
> + error = uiomove(>msg_bufc[mbp->msg_bufr], l, uio);
>   if (error)
>   break;
>   mbp->msg_bufr += l;
> 
> cheers,
> natano
> 



Re: fix iwn firmware error during init

2016-01-12 Thread Stefan Kempf
Stefan Sperling wrote:
> I've run into an issue where iwn(4) fails to init the hardware.
> 
> Running 'ifconfig iwn0 scan' resulted in:
> 
> setting configuration
> iwn0: fatal firmware error
> firmware error log:
>   error type  = "SYSASSERT" (0x0005)
>   program counter = 0x00022088
>   source line = 0x00A4
>   error data  = 0x00A4
>   branch link = 0x0002225800022258
>   interrupt link  = 0x1532
>   time= 27873
> driver status:
>   tx ring  0: qid=0  cur=0   queued=0  
>   tx ring  1: qid=1  cur=0   queued=0  
>   tx ring  2: qid=2  cur=0   queued=0  
>   tx ring  3: qid=3  cur=0   queued=0  
>   tx ring  4: qid=4  cur=6   queued=0  
>   tx ring  5: qid=5  cur=0   queued=0  
>   tx ring  6: qid=6  cur=0   queued=0  
>   tx ring  7: qid=7  cur=0   queued=0  
>   tx ring  8: qid=8  cur=0   queued=0  
>   tx ring  9: qid=9  cur=0   queued=0  
>   tx ring 10: qid=10 cur=0   queued=0  
>   tx ring 11: qid=11 cur=0   queued=0  
>   tx ring 12: qid=12 cur=0   queued=0  
>   tx ring 13: qid=13 cur=0   queued=0  
>   tx ring 14: qid=14 cur=0   queued=0  
>   tx ring 15: qid=15 cur=0   queued=0  
>   tx ring 16: qid=16 cur=0   queued=0  
>   tx ring 17: qid=17 cur=0   queued=0  
>   tx ring 18: qid=18 cur=0   queued=0  
>   tx ring 19: qid=19 cur=0   queued=0  
>   rx ring: cur=7
>   802.11 state 0
> iwn0: RXON command failed
> iwn0: could not configure device
> iwn0: could not load firmware .data section
> iwn0: could not load firmware
> iwn0: could not initialize hardware
> 
> A debug printf revealed the rxon command channel was set to zero:
> 
>   iwn_config: rxon chan 0 flags 40008000 cck f ofdm ff
> 
> Fix a misplaced curly brace while here...

Cannot comment on the code unfortunately. But with a current kernel
I regularly get the above error on startup since today. Your diff fixes it.
I'd be happy to see a fix for this to go in :-)

> ok?
> 
> Index: if_iwn.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_iwn.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 if_iwn.c
> --- if_iwn.c  7 Jan 2016 23:08:38 -   1.153
> +++ if_iwn.c  9 Jan 2016 21:12:23 -
> @@ -3429,7 +3429,7 @@ iwn_set_link_quality(struct iwn_softc *s
>   /* Next retry at immediate lower bit-rate. */
>   if (txrate > 0)
>   txrate--;
> - }
> + }
>   }
>  
>   return iwn_cmd(sc, IWN_CMD_LINK_QUALITY, , sizeof linkq, 1);
> @@ -4455,15 +4455,9 @@ iwn_config(struct iwn_softc *sc)
>   IEEE80211_ADDR_COPY(ic->ic_myaddr, LLADDR(ifp->if_sadl));
>   IEEE80211_ADDR_COPY(sc->rxon.myaddr, ic->ic_myaddr);
>   IEEE80211_ADDR_COPY(sc->rxon.wlap, ic->ic_myaddr);
> - sc->rxon.chan = ieee80211_chan2ieee(ic, ic->ic_ibss_chan);
> + sc->rxon.chan = 1;
>   sc->rxon.flags = htole32(IWN_RXON_TSF | IWN_RXON_CTS_TO_SELF);
> - if (IEEE80211_IS_CHAN_2GHZ(ic->ic_ibss_chan)) {
> - sc->rxon.flags |= htole32(IWN_RXON_AUTO | IWN_RXON_24GHZ);
> - if (ic->ic_flags & IEEE80211_F_USEPROT)
> - sc->rxon.flags |= htole32(IWN_RXON_TGG_PROT);
> - DPRINTF(("%s: 2ghz prot 0x%x\n", __func__,
> - le32toh(sc->rxon.flags)));
> - }
> + sc->rxon.flags |= htole32(IWN_RXON_AUTO | IWN_RXON_24GHZ);
>   switch (ic->ic_opmode) {
>   case IEEE80211_M_STA:
>   sc->rxon.mode = IWN_MODE_STA;
> @@ -4489,6 +4483,9 @@ iwn_config(struct iwn_softc *sc)
>   IWN_RXCHAIN_IDLE_COUNT(2);
>   sc->rxon.rxchain = htole16(rxchain);
>   DPRINTF(("setting configuration\n"));
> + DPRINTF(("%s: rxon chan %d flags %x cck %x ofdm %x\n", __func__,
> + sc->rxon.chan, le32toh(sc->rxon.flags), sc->rxon.cck_mask,
> + sc->rxon.ofdm_mask));
>   error = iwn_cmd(sc, IWN_CMD_RXON, >rxon, sc->rxonsz, 0);
>   if (error != 0) {
>   printf("%s: RXON command failed\n", sc->sc_dev.dv_xname);
> 



Re: wsevent.c uiomove() conversion

2016-01-11 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for dev/wscons/wsevent.c. 'cnt' could
> as well be a size_t, but using u_int makes clear, that it will never
> exceed UINT_MAX, and that 'ev->get = cnt;' won't overflow.

Makes sense to me.

ok?
 
> Index: dev/wscons/wsevent.c
> ===
> RCS file: /cvs/src/sys/dev/wscons/wsevent.c,v
> retrieving revision 1.14
> diff -u -p -u -r1.14 wsevent.c
> --- dev/wscons/wsevent.c  10 Sep 2015 18:14:52 -  1.14
> +++ dev/wscons/wsevent.c  9 Jan 2016 08:35:55 -
> @@ -136,7 +136,9 @@ wsevent_fini(struct wseventvar *ev)
>  int
>  wsevent_read(struct wseventvar *ev, struct uio *uio, int flags)
>  {
> - int s, n, cnt, error;
> + int s, error;
> + u_int cnt;
> + size_t n;
>  
>   /*
>* Make sure we can return at least 1.
> @@ -169,7 +171,7 @@ wsevent_read(struct wseventvar *ev, stru
>   n = howmany(uio->uio_resid, sizeof(struct wscons_event));
>   if (cnt > n)
>   cnt = n;
> - error = uiomovei((caddr_t)>q[ev->get],
> + error = uiomove((caddr_t)>q[ev->get],
>   cnt * sizeof(struct wscons_event), uio);
>   n -= cnt;
>   /*
> @@ -182,7 +184,7 @@ wsevent_read(struct wseventvar *ev, stru
>   return (error);
>   if (cnt > n)
>   cnt = n;
> - error = uiomovei((caddr_t)>q[0],
> + error = uiomove((caddr_t)>q[0],
>   cnt * sizeof(struct wscons_event), uio);
>   ev->get = cnt;
>   return (error);
> 
> cheers,
> natano
> 



Re: ksyms uiomove() conversion

2016-01-10 Thread Stefan Kempf
Martin Natano wrote:
> Below the uiomove() conversion for dev/ksyms.c. 'len' is a size_t.

And len is computed from values >= 0 and from expressions that do not
wrap around.

ok?
 
> Index: dev/ksyms.c
> ===
> RCS file: /cvs/src/sys/dev/ksyms.c,v
> retrieving revision 1.30
> diff -u -p -u -r1.30 ksyms.c
> --- dev/ksyms.c   29 Aug 2015 01:58:39 -  1.30
> +++ dev/ksyms.c   9 Jan 2016 08:16:50 -
> @@ -164,7 +164,7 @@ ksymsread(dev_t dev, struct uio *uio, in
>   if (len > uio->uio_resid)
>   len = uio->uio_resid;
>  
> - if ((error = uiomovei(v, len, uio)) != 0)
> + if ((error = uiomove(v, len, uio)) != 0)
>   return (error);
>   }
>  
> cheers,
> natano
> 



Re: fusefs uiomove() conversion

2016-01-10 Thread Stefan Kempf
looks good to me.

ok?

Martin Natano wrote:
> Below the conversion from uiomovei() to uiomove() for miscfs/fusefs/.
> All size parameters already are size_t, so the diff is straightforward.
> 
> Index: miscfs/fuse/fuse_device.c
> ===
> RCS file: /cvs/src/sys/miscfs/fuse/fuse_device.c,v
> retrieving revision 1.19
> diff -u -p -u -r1.19 fuse_device.c
> --- miscfs/fuse/fuse_device.c 2 Sep 2015 04:07:11 -   1.19
> +++ miscfs/fuse/fuse_device.c 9 Jan 2016 07:28:04 -
> @@ -496,7 +496,7 @@ fusewrite(dev_t dev, struct uio *uio, in
>   }
>  
>   /* Get the missing datas from the fbuf */
> - error = uiomovei(>FD, uio->uio_resid, uio);
> + error = uiomove(>FD, uio->uio_resid, uio);
>   if (error)
>   return error;
>   fbuf->fb_dat = NULL;
> Index: miscfs/fuse/fuse_vnops.c
> ===
> RCS file: /cvs/src/sys/miscfs/fuse/fuse_vnops.c,v
> retrieving revision 1.25
> diff -u -p -u -r1.25 fuse_vnops.c
> --- miscfs/fuse/fuse_vnops.c  23 Sep 2015 15:37:26 -  1.25
> +++ miscfs/fuse/fuse_vnops.c  9 Jan 2016 07:28:09 -
> @@ -728,7 +728,7 @@ fusefs_readdir(void *v)
>   break;
>   }
>  
> - if ((error = uiomovei(fbuf->fb_dat, fbuf->fb_len, uio))) {
> + if ((error = uiomove(fbuf->fb_dat, fbuf->fb_len, uio))) {
>   fb_delete(fbuf);
>   break;
>   }
> @@ -810,7 +810,7 @@ fusefs_readlink(void *v)
>   return (error);
>   }
>  
> - error = uiomovei(fbuf->fb_dat, fbuf->fb_len, uio);
> + error = uiomove(fbuf->fb_dat, fbuf->fb_len, uio);
>   fb_delete(fbuf);
>  
>   return (error);
> @@ -1058,7 +1058,7 @@ fusefs_read(void *v)
>   if (error)
>   break;
>  
> - error = uiomovei(fbuf->fb_dat, MIN(size, fbuf->fb_len), uio);
> + error = uiomove(fbuf->fb_dat, MIN(size, fbuf->fb_len), uio);
>   if (error)
>   break;
>  
> @@ -1112,7 +1112,7 @@ fusefs_write(void *v)
>   fbuf->fb_io_off = uio->uio_offset;
>   fbuf->fb_io_len = len;
>  
> - if ((error = uiomovei(fbuf->fb_dat, len, uio))) {
> + if ((error = uiomove(fbuf->fb_dat, len, uio))) {
>   printf("fusefs: uio error %i\n", error);
>   break;
>   }
> 
> cheers,
> natano



Re: integer truncation in soreceive()

2016-01-01 Thread Stefan Kempf
Thanks. A similar diff was discussed privately with a few
developers during the last few days and is about to be
committed soon.

Martin Natano wrote:
> Another integer overflow: A recv() call with a size of 2^32 bytes causes
> soreceive() to spin in an endless loop, resulting in a system freeze.
> The diff below prevents this behaviour by establishing an upper bound
> for uio_resid before assigning the value to an integer variable with
> smaller width. Also the 'offset' and 'resid' variables are converted to
> use the correct integer types.
> 
> cheers,
> natano
> 
> Index: kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.144
> diff -u -p -u -r1.144 uipc_socket.c
> --- kern/uipc_socket.c5 Dec 2015 10:11:53 -   1.144
> +++ kern/uipc_socket.c31 Dec 2015 21:26:01 -
> @@ -613,13 +613,14 @@ soreceive(struct socket *so, struct mbuf
>  {
>   struct mbuf *m, **mp;
>   struct mbuf *cm;
> - int flags, len, error, s, offset;
> + int flags, len, error, s;
> + u_long offset;
>   struct protosw *pr = so->so_proto;
>   struct mbuf *nextrecord;
>   int moff, type = 0;
>   size_t orig_resid = uio->uio_resid;
>   int uio_error = 0;
> - int resid;
> + size_t resid;
>  
>   mp = mp0;
>   if (paddr)
> @@ -639,8 +640,8 @@ soreceive(struct socket *so, struct mbuf
>   if (error)
>   goto bad;
>   do {
> - error = uiomovei(mtod(m, caddr_t),
> - (int) min(uio->uio_resid, m->m_len), uio);
> + error = uiomove(mtod(m, caddr_t),
> + ulmin(uio->uio_resid, m->m_len), uio);
>   m = m_free(m);
>   } while (uio->uio_resid && error == 0 && m);
>  bad:
> @@ -833,11 +834,9 @@ dontblock:
>   panic("receive 3");
>  #endif
>   so->so_state &= ~SS_RCVATMARK;
> - len = uio->uio_resid;
> + len = ulmin(uio->uio_resid, m->m_len - moff);
>   if (so->so_oobmark && len > so->so_oobmark - offset)
>   len = so->so_oobmark - offset;
> - if (len > m->m_len - moff)
> - len = m->m_len - moff;
>   /*
>* If mp is set, just pass back the mbufs.
>* Otherwise copy them out via the uio, then free.
> 



uvm: remove unused args in uvm_mapent_fork*()

2015-12-11 Thread Stefan Kempf
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.204
diff -u -p -r1.204 uvm_map.c
--- uvm/uvm_map.c   14 Nov 2015 14:53:14 -  1.204
+++ uvm/uvm_map.c   11 Dec 2015 18:47:51 -
@@ -183,15 +183,12 @@ intuvm_mapent_bias(struct 
vm_map*, s
 struct vm_map_entry*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
vsize_t, struct vm_map_entry*,
struct uvm_map_deadq*, int, int);
-struct vm_map_entry*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
-   struct vm_map*, struct vm_map_entry*,
-   struct uvm_map_deadq*);
-struct vm_map_entry*uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
-   struct vm_map*, struct vm_map_entry*,
-   struct uvm_map_deadq*);
-struct vm_map_entry*uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
-   struct vm_map*, struct vm_map_entry*,
-   struct uvm_map_deadq*);
+struct vm_map_entry*uvm_mapent_forkshared(struct vm_map*, struct vm_map*,
+   struct vm_map_entry*, struct uvm_map_deadq*);
+struct vm_map_entry*uvm_mapent_forkcopy(struct vm_map*, struct vm_map*,
+   struct vm_map_entry*, struct uvm_map_deadq*);
+struct vm_map_entry*uvm_mapent_forkzero(struct vm_map*,
+   struct vm_map_entry*, struct uvm_map_deadq*);
 
 /*
  * Tree validation.
@@ -3418,8 +3415,7 @@ uvm_mapent_clone(struct vm_map *dstmap, 
  * new entries to share amaps and backing objects.
  */
 struct vm_map_entry *
-uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
-struct vm_map *old_map,
+uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
 struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
 {
struct vm_map_entry *new_entry;
@@ -3462,8 +3458,7 @@ uvm_mapent_forkshared(struct vmspace *ne
  * (note that new references are read-only).
  */
 struct vm_map_entry *
-uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
-struct vm_map *old_map,
+uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
 struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
 {
struct vm_map_entry *new_entry;
@@ -3604,8 +3599,7 @@ uvm_mapent_forkcopy(struct vmspace *new_
  * zero the mapping: the new entry will be zero initialized
  */
 struct vm_map_entry *
-uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
-struct vm_map *old_map,
+uvm_mapent_forkzero(struct vm_map *new_map,
 struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
 {
struct vm_map_entry *new_entry;
@@ -3682,16 +3676,16 @@ uvmspace_fork(struct process *pr)
/* Apply inheritance. */
switch (old_entry->inheritance) {
case MAP_INHERIT_SHARE:
-   new_entry = uvm_mapent_forkshared(vm2, new_map,
+   new_entry = uvm_mapent_forkshared(new_map,
old_map, old_entry, );
break;
case MAP_INHERIT_COPY:
-   new_entry = uvm_mapent_forkcopy(vm2, new_map,
+   new_entry = uvm_mapent_forkcopy(new_map,
old_map, old_entry, );
break;
case MAP_INHERIT_ZERO:
-   new_entry = uvm_mapent_forkzero(vm2, new_map,
-   old_map, old_entry, );
+   new_entry = uvm_mapent_forkzero(new_map,
+   old_entry, );
break;
default:
continue;



Re: Linker changes between 5.7 and 5.8

2015-12-04 Thread Stefan Kempf
Tati Chevron wrote:
> This assembled and linked without problems on 5.7-release, but now when
> I try it on 5.8-release, I get an error:
> 
> $ as -o charset.o charset.S
> $ ld -Bstatic charset.o
 
> ld: charset.o: relocation R_X86_64_32S against `a local symbol' can not
> be used when making a shared object; recompile with -fPIC
> charset.o: could not read symbols: Bad value

Try it with ld -Bstatic -nopie charset.o



vmm: run guest without kernel biglock

2015-11-25 Thread Stefan Kempf
Hi,

this diff makes guests runnable without holding the biglock.
If the guest exits because of an interrupt, the interrupt is
handled first before re-grabbing the lock.

This is needed because the TLB shootdown code runs under the biglock
and issues an IPI to other CPUs to invalidate the TLB. Then the TLB
shootdown code spins under the biglock, waiting for the other
CPUs to do their work.

If we try to grab the biglock in vmm(4) before handling the
IPI, vmm(4) will block forever.

Index: sys/arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.7
diff -u -p -r1.7 vmm.c
--- sys/arch/amd64/amd64/vmm.c  24 Nov 2015 09:07:09 -  1.7
+++ sys/arch/amd64/amd64/vmm.c  26 Nov 2015 04:28:28 -
@@ -2520,17 +2520,17 @@ vcpu_run_vmx(struct vcpu *vcpu, uint8_t 
invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, );
 
/* Start / resume the VM / VCPU */
-   /* XXX unlock the biglock here */
+   KERNEL_UNLOCK();
ret = vmx_enter_guest(>vc_control_pa,
>vc_gueststate, resume);
-   /* XXX lock the biglock here */
 
-   /* If we exited successfully ... */
-   if (ret == 0) {
+   switch (ret) {
+   case 0: /* If we exited successfully ... */
resume = 1;
vcpu->vc_last_pcpu = ci;
if (vmread(VMCS_GUEST_IA32_RIP,
>vc_gueststate.vg_rip)) {
+   KERNEL_LOCK();
printf("vcpu_run_vmx: cannot read guest rip\n");
ret = EINVAL;
exit_handled = 0;
@@ -2538,6 +2538,7 @@ vcpu_run_vmx(struct vcpu *vcpu, uint8_t 
}
 
if (vmread(VMCS_EXIT_REASON, _reason)) {
+   KERNEL_LOCK();
printf("vcpu_run_vmx: cant read exit reason\n");
ret = EINVAL;
exit_handled = 0;
@@ -2547,9 +2548,24 @@ vcpu_run_vmx(struct vcpu *vcpu, uint8_t 
/*
 * Handle the exit. This will alter "ret" to EAGAIN if
 * the exit handler determines help from vmd is needed.
+*
+* When the guest exited because of an external
+* interrupt, handle it first before grabbing the
+* kernel lock: we might have gotten an IPI that has
+* to do some work that another CPU depends on.
+*
+* Example: CPU 1 grabs the kernel lock, sets some
+* flag to 1, issues an IPI to all other CPUs, and
+* waits for them to clear the flag. If this code
+* grabs the kernel lock here first, it will block
+* forever.
 */
vcpu->vc_gueststate.vg_exit_reason = exit_reason;
+   if (exit_reason != VMX_EXIT_EXTINT)
+   KERNEL_LOCK();
exit_handled = vmx_handle_exit(vcpu, );
+   if (exit_reason == VMX_EXIT_EXTINT)
+   KERNEL_LOCK();
 
/* Check if we should yield - don't hog the cpu */
spc = >ci_schedstate;
@@ -2561,25 +2577,31 @@ vcpu_run_vmx(struct vcpu *vcpu, uint8_t 
}
yield();
}
-   } else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) {
+   break;
+   case VMX_FAIL_LAUNCH_INVALID_VMCS:
+   KERNEL_LOCK();
printf("vmx_enter_guest: failed launch with invalid "
"vmcs\n");
ret = EINVAL;
exit_handled = 0;
-   } else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) {
+   break;
+   case VMX_FAIL_LAUNCH_VALID_VMCS:
+   KERNEL_LOCK();
exit_reason = vcpu->vc_gueststate.vg_exit_reason;
printf("vmx_enter_guest: failed launch with valid "
"vmcs, code=%lld (%s)\n", exit_reason,
vmx_instruction_error_decode(exit_reason));
ret = EINVAL;
exit_handled = 0;
-   } else {
+   break;
+   default:
+   KERNEL_LOCK();
printf("vmx_enter_guest: failed launch for unknown "
"reason\n");