Re: [Qemu-devel] [qemu-s390x] [PATCH v7 05/12] s390-ccw: move auxiliary IPL data to separate location

2018-02-17 Thread Thomas Huth
On 16.02.2018 23:07, Collin L. Walling wrote:
> The s390-ccw firmware needs some information in support of the
> boot process which is not available on the native machine.
> Examples are the netboot firmware load address and now the
> boot menu parameters.
> 
> While storing that data in unused fields of the IPL parameter block
> works, that approach could create problems if the parameter block
> definition should change in the future, because a guest could then
> overwrite these fields using the set IPLB diagnose.
> 
> In fact the data in question is of a more global nature and not really
> tied to an IPL device, so separating it is rather logical.
> 
> This commit introduces a new structure to hold firmware relevant
> IPL parameters set by QEMU. The data is stored at location 204 (dec)
> and can contain up to 7 32-bit words. This area is available to
> programming in the z/Architecture Principles of Operation and
> can thus safely be used by the firmware until the IPL has completed.
> 
> Signed-off-by: Viktor Mihajlovski 
> Signed-off-by: Collin L. Walling 
> ---
[...]
> diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
> index 8a705e0..74469b1 100644
> --- a/hw/s390x/ipl.h
> +++ b/hw/s390x/ipl.h
> @@ -16,8 +16,7 @@
>  #include "cpu.h"
>  
>  struct IplBlockCcw {
> -uint64_t netboot_start_addr;
> -uint8_t  reserved0[77];
> +uint8_t  reserved0[85];
>  uint8_t  ssid;
>  uint16_t devno;
>  uint8_t  vm_flags;
> @@ -59,6 +58,21 @@ typedef struct IplBlockQemuScsi IplBlockQemuScsi;
>  
>  #define DIAG308_FLAGS_LP_VALID 0x80
>  
> +#define QIPL_ADDRESS  0xcc
> +
> +/*
> + * The QEMU IPL Parameters will be stored 32-bit word aligned.
> + * Placement of data fields in this area must account for
> + * their alignment needs.
> + * The entire structure must not be larger than 28 bytes.
> + */
> +struct QemuIplParameters {
> +uint8_t  reserved1[4];
> +uint64_t netboot_start_addr;
> +uint8_t  reserved2[16];
> +} QEMU_PACKED;
> +typedef struct QemuIplParameters QemuIplParameters;
> +
>  union IplParameterBlock {
>  struct {
>  uint32_t len;
> @@ -74,6 +88,7 @@ union IplParameterBlock {
>  IplBlockFcp fcp;
>  IplBlockQemuScsi scsi;
>  };
> +QemuIplParameters qipl;
>  } QEMU_PACKED;
>  struct {
>  uint8_t  reserved1[110];

I still think that the information should *not* be stored within the
IplParameterBlock, so that we do not end up passing it via DIAG 0x308, too.
If we do it like this, I'm pretty sure that we will look at this code in
a couple of years and wonder whether we can change it again or whether
this is an established interface between the host and the guest. So
please, let's avoid establishing such "hidden" interfaces just out of
current convenience. There must be a better location for this.
Christian, do you have an idea?

 Thomas



Re: [Qemu-devel] [qemu-s390x] [PATCH v7 06/12] s390-ccw: parse and set boot menu options

2018-02-17 Thread Thomas Huth
On 16.02.2018 23:07, Collin L. Walling wrote:
> Set boot menu options for an s390 guest and store them in
> the iplb. These options are set via the QEMU command line
> option:
> 
> -boot menu=on|off[,splash-time=X]
> 
> or via the libvirt domain xml:
> 
> <os>
>   <bootmenu enable='yes' timeout='X'/>
> </os>
> 
> Where X is a positive integer denoting a time in milliseconds.
> 
> Any value set for loadparm will override all boot menu options.
> If loadparm=PROMPT, then the menu will be enabled without a
> timeout.
> 
> The absence of any boot options on the command line is flagged
> so that the zipl boot loader values are used later.
> 
> Signed-off-by: Collin L. Walling 
> Reviewed-by: Janosch Frank 
> Reviewed-by: Thomas Huth 

You've managed to add new bugs here. Please drop my Reviewed-by again.

> ---
>  hw/s390x/ipl.c  | 48 
>  hw/s390x/ipl.h  |  9 +++--
>  pc-bios/s390-ccw/iplb.h |  6 --
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
> index 31565ce..c8109f5 100644
> --- a/hw/s390x/ipl.c
> +++ b/hw/s390x/ipl.c
> @@ -23,6 +23,9 @@
>  #include "hw/s390x/ebcdic.h"
>  #include "ipl.h"
>  #include "qemu/error-report.h"
> +#include "qemu/config-file.h"
> +#include "qemu/cutils.h"
> +#include "qemu/option.h"
>  
>  #define KERN_IMAGE_START0x01UL
>  #define KERN_PARM_AREA  0x010480UL
> @@ -219,6 +222,50 @@ static Property s390_ipl_properties[] = {
>  DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +static void s390_ipl_set_boot_menu(IplParameterBlock *iplb)
> +{
> +QemuOptsList *plist = qemu_find_opts("boot-opts");
> +QemuOpts *opts = QTAILQ_FIRST(&plist->head);
> +uint8_t *flags;
> +uint32_t *timeout;
> +const char *tmp;
> +unsigned long splash_time = 0;
> +
> +switch (iplb->pbt) {
> +case S390_IPL_TYPE_CCW:
> +case S390_IPL_TYPE_QEMU_SCSI:
> +flags = &iplb->qipl.boot_menu_flags;
> +timeout = &iplb->qipl.boot_menu_timeout;
> +break;
> +default:
> +error_report("boot menu is not supported for this device type.");
> +return;
> +}
> +
> +/* In the absence of -boot menu, use zipl parameters */
> +if (!qemu_opt_get(opts, "menu")) {
> +*flags = BOOT_MENU_FLAG_ZIPL_OPTS;
> +} else if (boot_menu) {
> +*flags = BOOT_MENU_FLAG_CMD_OPTS;
> +
> +tmp = qemu_opt_get(opts, "splash-time");
> +
> +if (tmp && qemu_strtoul(tmp, NULL, 10, &splash_time)) {
> +error_report("splash-time is invalid, forcing it to 0.");
> +splash_time = 0;

The earlier version of this patch used "*timeout = 0", which was OK. Now
you've changed it to the local variable splash_time, but also kept the
return statement below. This is bad. Either change it back to *timeout
or drop the return statement.

> +return;
> +}
> +
> +if (splash_time > 0xffffffff) {
> +error_report("splash-time is too large, forcing it to max value.");
> +splash_time = 0xffffffff;
> +return;

Ditto.
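
As a sketch, the "drop the return statement" variant of the quoted hunk would
look roughly like this, keeping the clamping behaviour:

    tmp = qemu_opt_get(opts, "splash-time");

    if (tmp && qemu_strtoul(tmp, NULL, 10, &splash_time)) {
        error_report("splash-time is invalid, forcing it to 0.");
        splash_time = 0;   /* no return: fall through to the clamp below */
    }

    if (splash_time > 0xffffffff) {
        error_report("splash-time is too large, forcing it to max value.");
        splash_time = 0xffffffff;
    }

    *timeout = cpu_to_be32(splash_time);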

> +}
> +
> +*timeout = cpu_to_be32(splash_time);
> +}
> +}
> +
>  static bool s390_gen_initial_iplb(S390IPLState *ipl)
>  {
>  DeviceState *dev_st;
> @@ -435,6 +482,7 @@ void s390_ipl_prepare_cpu(S390CPU *cpu)
>  }
>  ipl->iplb.qipl.netboot_start_addr = cpu_to_be64(ipl->start_addr);
>  }
> +s390_ipl_set_boot_menu(&ipl->iplb);
>  s390_ipl_prepare_qipl(cpu);
>  
>  }
> diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
> index 74469b1..f632c59 100644
> --- a/hw/s390x/ipl.h
> +++ b/hw/s390x/ipl.h
> @@ -60,6 +60,9 @@ typedef struct IplBlockQemuScsi IplBlockQemuScsi;
>  
>  #define QIPL_ADDRESS  0xcc
>  
> +#define BOOT_MENU_FLAG_CMD_OPTS  0x80
> +#define BOOT_MENU_FLAG_ZIPL_OPTS 0x40
> +
>  /*
>   * The QEMU IPL Parameters will be stored 32-bit word aligned.
>   * Placement of data fields in this area must account for
> @@ -67,9 +70,11 @@ typedef struct IplBlockQemuScsi IplBlockQemuScsi;
>   * The entire structure must not be larger than 28 bytes.
>   */
>  struct QemuIplParameters {
> -uint8_t  reserved1[4];
> +uint8_t  boot_menu_flags;
> +uint8_t  reserved1[3];
> +uint32_t boot_menu_timeout;
>  uint64_t netboot_start_addr;
> -uint8_t  reserved2[16];
> +uint8_t  reserved2[12];
>  } QEMU_PACKED;
>  typedef struct QemuIplParameters QemuIplParameters;
>  
> diff --git a/pc-bios/s390-ccw/iplb.h b/pc-bios/s390-ccw/iplb.h
> index a23237e..0e39aa0 100644
> --- a/pc-bios/s390-ccw/iplb.h
> +++ b/pc-bios/s390-ccw/iplb.h
> @@ -81,9 +81,11 @@ extern IplParameterBlock iplb 
> __attribute__((__aligned__(PAGE_SIZE)));
>   * The entire structure must not be larger than 28 bytes.
>   */
>  struct QemuIplParameters {
> -uint8_t  reserved1[4];
> +uint8_t  boot_menu_flags;
> +uint8_t  reserved1[3];
> +uint32_t boot_menu_timeout;
>  uint64_t netboot_start_addr;
> -uint8_t

Re: [Qemu-devel] [PATCH] kvm: add stubs for kvm_vcpu_id_is_valid() and kvm_arch_vcpu_id()

2018-02-17 Thread Paolo Bonzini
On 16/02/2018 19:11, Greg Kurz wrote:
> These two functions are essentially called by code that is only
> compiled when CONFIG_KVM=y, with the notable exception of the
> two users in the sPAPR code:
> 
> $ git grep -E -l 'kvm_arch_vcpu_id|kvm_vcpu_id_is_valid'
> accel/kvm/kvm-all.c
> hw/intc/openpic_kvm.c
> hw/intc/xics_kvm.c
> hw/ppc/spapr.c
> include/sysemu/kvm.h
> target/arm/kvm.c
> target/i386/kvm.c
> target/mips/kvm.c
> target/ppc/kvm.c
> target/s390x/kvm.c
> 
> In hw/ppc/spapr.c:
> 
> if (kvm_enabled()) {
> return kvm_arch_vcpu_id(cs);
> } else {
> return cs->cpu_index;
> }
> 
> and
> 
> if (kvm_enabled() && !kvm_vcpu_id_is_valid(cpu->vcpu_id)) {
> ...
> }
> 
> This code happens to compile without CONFIG_KVM=y simply because
> kvm_enabled() expands to (0) and the compiler optimizes the dead
> code away. Unless this was done on purpose

Yes, it is.  There are more examples, the first I saw is:

uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
: SPAPR_TIMEBASE_FREQ;
uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 10;
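
For reference, a simplified sketch of why the dead code disappears (the real
macro in include/sysemu/kvm.h carries a few more conditions):

    #ifdef CONFIG_KVM
    #define kvm_enabled()  (kvm_allowed)
    #else
    #define kvm_enabled()  (0)
    #endif

    /* With CONFIG_KVM unset, the branch below reduces to
     * "return cs->cpu_index;", so kvm_arch_vcpu_id() is never
     * referenced and no stub is needed. */
    if (kvm_enabled()) {
        return kvm_arch_vcpu_id(cs);
    } else {
        return cs->cpu_index;
    }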

> to indicate no stubs
> are required, and we'd rather break the build if calling these
> from KVM agnostic code

That's the idea. :)

Paolo



Re: [Qemu-devel] [Process] QEMU submaintainers and pull requests

2018-02-17 Thread Paolo Bonzini
On 16/02/2018 17:09, Cornelia Huck wrote:
>> BTW, Fam and I are also planning to improve Patchew so that it is better
>> at detecting pull requests and especially merged pull requests.
> 
> Cool.
> 
> Would a certain formatting be helpful for patchew as well? Or do you
> plan to parse the cover letter to hopefully find the output of git
> request-pull and get the correct base from that?

Yes, basically replace "git am" with "git fetch" if the cover letter has
a line that looks like it's produced by "git request-pull".
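
For reference, a cover letter produced by "git request-pull" contains a block
roughly like the following (repository URL and tag are placeholders); the
"are available in the Git repository at:" line is the kind of marker such
detection could key on:

    The following changes since commit <commit-id>:

      last merged subject line (2018-02-16 12:34:56 +0000)

    are available in the Git repository at:

      git://example.org/user/qemu.git tags/example-pull-request

    for you to fetch changes up to <commit-id>: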

Paolo



Re: [Qemu-devel] [PULL 09/23] gdbstub: Fix vCont behaviour

2018-02-17 Thread Jan Kiszka
On 2017-02-16 15:31, Paolo Bonzini wrote:
> From: Claudio Imbrenda 
> 
> When GDB issues a "vCont", QEMU was not handling it correctly when
> multiple VCPUs are active.
> For vCont, for each thread (VCPU), it can be specified whether to
> single step, continue or stop that thread. The default is to stop a
> thread.
> However, when (for example) "vCont;s:2" is issued, all VCPUs continue
> to run, although all but VCPU nr 2 are to be stopped.
> 
> This patch completely rewrites the vCont parsing code.
> 
> Please note that this improvement only works in system emulation mode;
> in userspace emulation mode the old behaviour is preserved.
> 
> Signed-off-by: Claudio Imbrenda 
> Message-Id: <1487092068-16562-3-git-send-email-imbre...@linux.vnet.ibm.com>
> Signed-off-by: Paolo Bonzini 
> ---
>  gdbstub.c | 209 
> --
>  1 file changed, 162 insertions(+), 47 deletions(-)
> 
> diff --git a/gdbstub.c b/gdbstub.c
> index 755a8e3..9911153 100644
> --- a/gdbstub.c
> +++ b/gdbstub.c
> @@ -387,6 +387,60 @@ static inline void gdb_continue(GDBState *s)
>  #endif
>  }
>  
> +/*
> + * Resume execution, per CPU actions. For user-mode emulation it's
> + * equivalent to gdb_continue.
> + */
> +static int gdb_continue_partial(GDBState *s, char *newstates)
> +{
> +CPUState *cpu;
> +int res = 0;
> +#ifdef CONFIG_USER_ONLY
> +/*
> + * This is not exactly accurate, but it's an improvement compared to the
> + * previous situation, where only one CPU would be single-stepped.
> + */
> +CPU_FOREACH(cpu) {
> +if (newstates[cpu->cpu_index] == 's') {
> +cpu_single_step(cpu, sstep_flags);
> +}
> +}
> +s->running_state = 1;
> +#else
> +int flag = 0;
> +
> +if (!runstate_needs_reset()) {
> +if (vm_prepare_start()) {
> +return 0;
> +}
> +
> +CPU_FOREACH(cpu) {
> +switch (newstates[cpu->cpu_index]) {
> +case 0:
> +case 1:
> +break; /* nothing to do here */
> +case 's':
> +cpu_single_step(cpu, sstep_flags);
> +cpu_resume(cpu);
> +flag = 1;
> +break;
> +case 'c':
> +cpu_resume(cpu);
> +flag = 1;
> +break;
> +default:
> +res = -1;
> +break;
> +}
> +}
> +}
> +if (flag) {
> +qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
> +}
> +#endif
> +return res;
> +}
> +
>  static void put_buffer(GDBState *s, const uint8_t *buf, int len)
>  {
>  #ifdef CONFIG_USER_ONLY
> @@ -785,6 +839,107 @@ static int is_query_packet(const char *p, const char 
> *query, char separator)
>  (p[query_len] == '\0' || p[query_len] == separator);
>  }
>  
> +/**
> + * gdb_handle_vcont - Parses and handles a vCont packet.
> + * returns -ENOTSUP if a command is unsupported, -EINVAL or -ERANGE if there 
> is
> + * a format error, 0 on success.
> + */
> +static int gdb_handle_vcont(GDBState *s, const char *p)
> +{
> +int res, idx, signal = 0;
> +char cur_action;
> +char *newstates;
> +unsigned long tmp;
> +CPUState *cpu;
> +#ifdef CONFIG_USER_ONLY
> +int max_cpus = 1; /* global variable max_cpus exists only in system mode 
> */
> +
> +CPU_FOREACH(cpu) {
> +max_cpus = max_cpus <= cpu->cpu_index ? cpu->cpu_index + 1 : 
> max_cpus;
> +}
> +#endif
> +/* uninitialised CPUs stay 0 */
> +newstates = g_new0(char, max_cpus);
> +
> +/* mark valid CPUs with 1 */
> +CPU_FOREACH(cpu) {
> +newstates[cpu->cpu_index] = 1;
> +}
> +
> +/*
> + * res keeps track of what error we are returning, with -ENOTSUP meaning
> + * that the command is unknown or unsupported, thus returning an empty
> + * packet, while -EINVAL and -ERANGE cause an E22 packet, due to invalid,
> + *  or incorrect parameters passed.
> + */
> +res = 0;
> +while (*p) {
> +if (*p++ != ';') {
> +res = -ENOTSUP;
> +goto out;
> +}
> +
> +cur_action = *p++;
> +if (cur_action == 'C' || cur_action == 'S') {
> +cur_action = tolower(cur_action);
> +res = qemu_strtoul(p + 1, &p, 16, &tmp);
> +if (res) {
> +goto out;
> +}
> +signal = gdb_signal_to_target(tmp);
> +} else if (cur_action != 'c' && cur_action != 's') {
> +/* unknown/invalid/unsupported command */
> +res = -ENOTSUP;
> +goto out;
> +}
> +/* thread specification. special values: (none), -1 = all; 0 = any */
> +if ((p[0] == ':' && p[1] == '-' && p[2] == '1') || (p[0] != ':')) {
> +if (*p == ':') {
> +p += 3;
> +}
> +for (idx = 0; idx < max_cpus; idx++) {
> +if (newstates[idx]

Re: [Qemu-devel] [PULL 09/23] gdbstub: Fix vCont behaviour

2018-02-17 Thread Jan Kiszka
On 2018-02-17 09:56, Jan Kiszka wrote:
> On 2017-02-16 15:31, Paolo Bonzini wrote:
>> [...]

Re: [Qemu-devel] [Qemu-ppc] [PATCH v2 0/3] Sam460ex emulation

2018-02-17 Thread BALATON Zoltan

On Sat, 17 Feb 2018, Thomas Huth wrote:

On 16.02.2018 11:55, BALATON Zoltan wrote:

On Fri, 16 Feb 2018, Thomas Huth wrote:

On 15.02.2018 22:27, BALATON Zoltan wrote:

Remaining patches for Sam460ex emulation. The original cover letter
with more details is here:

http://lists.nongnu.org/archive/html/qemu-ppc/2017-08/msg00112.html

We'll need to also add binaries for firmware (customised u-boot
version) and dtb but I'm not sure how to submit those.


For the dtb, I think you could simply provide a patch that adds the dts
file to the pc-bios directory and another one that adds the dtb. Just
like it is already done with pc-bios/bamboo.dts / pc-bios/bamboo.dtb.


OK thanks, I'll do that. Does it have to be two separate patches?


I don't think so, I just thought that would be cleaner ... but one patch
should be fine, too, I guess. David?


For u-boot, can you use the same upstream level as e500 ? I.e. check
whether "git submodule status roms/u-boot" is fine for you? If that's
ok, just do a "git submodule update roms/u-boot" and build uboot from
that directory - you then can submit a binary patch with that file for
pc-bios, too.

In case you need another u-boot version, I think you've got to update
the submodule to the newer upstream version first, and then also rebuild
the e500 binary... Cumbersome, but that's necessary since we've got to
ship the u-boot sources in the QEMU release tarballs, too, to be
compliant with the GPL.


Unfortunately we can't use the same u-boot as e500 because this board
uses a forked and patched version which is not in upstream u-boot and
upstream u-boot has even dropped support for this CPU in latest version
so we actually need an older version (with patches) and not a newer one.


That's very unfortunate ... any chance that you could try to get that
CPU activated in upstream u-boot again and get the patches included there?


Not likely, because it was removed to simplify the code and get rid of
unmaintained CPUs. Also, the hardware vendor never upstreamed the board
code, which only exists in their sources, and these u-boot modifications
are needed for the boot loaders running on the hardware. So we cannot use
stock u-boot if we want to boot the usual OSes, which is what we want, as
that is the interesting part of this hardware.


Regards,
BALATON Zoltan


Therefore, it needs to be a binary built from a separate source so I
think a new submodule will need to be added for this. How to do that?
Where to host this git repo? Should I put it on github and refer to that
as an external repo or should it be hosted in qemu repo somehow?


No clue ... adding Stefan and Jeff to CC:, maybe they can recommend
something here.

Thomas






Re: [Qemu-devel] [PATCH v2] linux-user: make getcpu optional

2018-02-17 Thread Laurent Vivier
Le 17/02/2018 à 03:02, Mike Frysinger a écrit :
> Not all arches implement this, and the kernel doesn't require them to.
> Add ifdef logic to disable it when not available.
> 
> Signed-off-by: Mike Frysinger 
> ---
>  linux-user/syscall.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 799c8e2800ea..a9904fac791f 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -290,8 +290,10 @@ _syscall3(int, sys_sched_getaffinity, pid_t, pid, 
> unsigned int, len,
>  #define __NR_sys_sched_setaffinity __NR_sched_setaffinity
>  _syscall3(int, sys_sched_setaffinity, pid_t, pid, unsigned int, len,
>unsigned long *, user_mask_ptr);
> +#ifdef TARGET_NR_getcpu
>  #define __NR_sys_getcpu __NR_getcpu
>  _syscall3(int, sys_getcpu, unsigned *, cpu, unsigned *, node, void *, 
> tcache);
> +#endif

I didn't find any arch that doesn't have it in linux-user/*/syscall_nr.h
Which arches are you speaking about?

As it can be also undefined for the host arch, you should use:

#if defined(TARGET_NR_getcpu) && defined(__NR_getcpu)
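
i.e., applied to the hunk above, roughly:

    #if defined(TARGET_NR_getcpu) && defined(__NR_getcpu)
    #define __NR_sys_getcpu __NR_getcpu
    _syscall3(int, sys_getcpu, unsigned *, cpu, unsigned *, node, void *, tcache);
    #endif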

Thanks,
Laurent



[Qemu-devel] [PATCH] intel-iommu: Accept 64-bit writes to FEADDR

2018-02-17 Thread Jan Kiszka
From: Jan Kiszka 

Xen is doing this [1] and currently triggers an abort.

[1] 
http://xenbits.xenproject.org/gitweb/?p=xen.git;a=blob;f=xen/drivers/passthrough/vtd/iommu.c;h=daaed0abbdd06b6ba3d948ea103aadf02651e83c;hb=refs/heads/master#l1108

Reported-by: Luis Lloret 
Signed-off-by: Jan Kiszka 
---
 hw/i386/intel_iommu.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2e841cde27..b61d0da270 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2129,7 +2129,12 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
 
 /* Fault Event Address Register, 32-bit */
 case DMAR_FEADDR_REG:
-assert(size == 4);
+/*
+ * While the register is 32-bit only, some guests (Xen...) write to it
+ * with 64-bit. Ignore the upper part, that's likely what the hardware
+ * does as well (plus the upper part is not used by our model anyway).
+ */
+assert(size >= 4);
 vtd_set_long(s, addr, val);
 break;
 
-- 
2.13.6



[Qemu-devel] [PATCH] ui: Reorder vte terminal packing to avoid gtk3 warnings

2018-02-17 Thread Jan Kiszka
From: Jan Kiszka 

Fill the terminal box from right to left to avoid

Gtk-WARNING **: Allocating size to GtkScrollbar 0x55f6d54b0200 without
calling gtk_widget_get_preferred_width/height(). How does the code
know the size to allocate?

Signed-off-by: Jan Kiszka 
---
 ui/gtk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ui/gtk.c b/ui/gtk.c
index 1537751afa..81f729316b 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -1945,8 +1945,8 @@ static GSList *gd_vc_vte_init(GtkDisplayState *s, 
VirtualConsole *vc,
 scrollbar = gtk_vscrollbar_new(vadjustment);
 #endif
 
-gtk_box_pack_start(GTK_BOX(box), vc->vte.terminal, TRUE, TRUE, 0);
-gtk_box_pack_start(GTK_BOX(box), scrollbar, FALSE, FALSE, 0);
+gtk_box_pack_end(GTK_BOX(box), scrollbar, FALSE, FALSE, 0);
+gtk_box_pack_end(GTK_BOX(box), vc->vte.terminal, TRUE, TRUE, 0);
 
 vc->vte.box = box;
 vc->vte.scrollbar = scrollbar;
-- 
2.13.6



Re: [Qemu-devel] [PATCH v4 00/22] re-factor softfloat and add fp16 functions

2018-02-17 Thread Alex Bennée

Peter Maydell  writes:

> On 6 February 2018 at 16:47, Alex Bennée  wrote:
>> Hi,
>>
>> The main change is applying the __attribute__((flatten)) to some of
>> the public functions that show up in Emilio's dbt-benchmark. This
>> seems to be a cleaner solution that squashing inlines higher up the
>> chain and still leaves the chance for re-use for the less widely used
>> functions. The results are an improvement over v3 by some margin:
>>
>>  NBench score; higher is better
>>
>> [ASCII bar chart mangled in the archive: NBench scores (higher is better)
>>  for system-2.5, master, softfloat-v3 and this softfloat series across
>>  FOURIER, NEURAL NET, LU DECOMPOSITION and their gmean]
>>
>> Slightly easier to read PNG:
>>
>> https://i.imgur.com/XEeL0bC.png
>>
>> I think it's pretty ready for a merge. Shall I submit a pull myself or
>> does it make sense going via someone else? According to MAINTAINERS
>> Peter and Aurelien are responsible for this code...
>>
>> Alex Bennée (22):
>>   fpu/softfloat: implement float16_squash_input_denormal
>>   include/fpu/softfloat: remove USE_SOFTFLOAT_STRUCT_TYPES
>>   fpu/softfloat-types: new header to prevent excessive re-builds
>>   target/*/cpu.h: remove softfloat.h
>>   include/fpu/softfloat: implement float16_abs helper
>>   include/fpu/softfloat: implement float16_chs helper
>>   include/fpu/softfloat: implement float16_set_sign helper
>>   include/fpu/softfloat: add some float16 constants
>>   fpu/softfloat: improve comments on ARM NaN propagation
>>   fpu/softfloat: move the extract functions to the top of the file
>>   fpu/softfloat: define decompose structures
>>   fpu/softfloat: re-factor add/sub
>>   fpu/softfloat: re-factor mul
>>   fpu/softfloat: re-factor div
>>   fpu/softfloat: re-factor muladd
>>   fpu/softfloat: re-factor round_to_int
>>   fpu/softfloat: re-factor float to int/uint
>>   fpu/softfloat: re-factor int/uint to float
>>   fpu/softfloat: re-factor scalbn
>>   fpu/softfloat: re-factor minmax
>>   fpu/softfloat: re-factor compare
>>   fpu/softfloat: re-factor sqrt
>
> If you persuade git to use the --minimal, --patience or --histogram
> git diff option when generating these patches you'll find that it
> doesn't produce unreadable patches that provoke all the checkpatch
> warnings.

I think this is patchew getting confused as I generated the patches
with:

  [diff]
algorithm = minimal

In my qemu.git/.git/config

> That in turn will let you find the genuine warning that
> got lost in all the spurious ones:
>
> Checking PATCH 16/22: fpu/softfloat: re-factor round_to_int...
> WARNING: line over 80 characters
> #127: FILE: fpu/softfloat.c:1261:
> +inc = ((a.frac & roundeven_mask) != frac_lsbm1 ?
> frac_lsbm1 : 0);

Yeah that was in the release but the one character over is the ; and it
seemed nicer keeping all the logic on the same line.

> As far as I can tell from a quick search, the 'histogram'
> algorithm is reckoned to be about as fast as the default but
> much less likely to produce terrible diffs.
>
>   git config --global diff.algorithm histogram
>
> should set it up as the default for all diff-producing purposes
> including generating patches for email.
>
> thanks
> -- PMM


--
Alex Bennée



Re: [Qemu-devel] [PULL 09/23] gdbstub: Fix vCont behaviour

2018-02-17 Thread Alex Bennée

Jan Kiszka  writes:

> On 2018-02-17 09:56, Jan Kiszka wrote:
>> On 2017-02-16 15:31, Paolo Bonzini wrote:
>>> [...]

>>
>> Seems like no one is doing guest debugging with kvm on x86 except me,
>> and I'm only doing it too infrequently now: This one broke that use case
>> for SMP guests long ago. How was it tested?
>>
>> To reproduce the bug: set up an x86-64 guest kernel with > 1 core, break
>> on some prominent syscall entry (e.g. sys_execve), continue the guest on
>> hit and it will quickly lock up, even after disabling the breakpoint
>> again. Kernel version doesn't matter (was my first guess), gdb is
>> 7.7.50.20140604-cvs (OpenSUSE) here.

I thought I fixed this with 5a6a1ad181c658b810041d852b290ac836965aca

FWIW I do periodically test ARM TCG and KVM guest debug using:

  tests/guest-debug/test-gdbstub.py

But we are missing a nice integration to get an appropriate guest image
to automate this process. If we can fix that we should be able to turn
on the test as part of make check.


--
Alex Bennée



Re: [Qemu-devel] [PATCH v5 00/23] RISC-V QEMU Port Submission

2018-02-17 Thread Richard W.M. Jones

I just want to mention that we've been running this patch set in
production for a few days, doing hundreds of Fedora RISC-V builds with
‘-smp 4’ and it has been rock solid.

Therefore:

Tested-by: Richard W.M. Jones 

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-df lists disk usage of guests without needing to install any
software inside the virtual machine.  Supports Linux and Windows.
http://people.redhat.com/~rjones/virt-df/



[Qemu-devel] [PATCH 1/3] hw/i2c-ddc: Do not fail writes

2018-02-17 Thread Linus Walleij
The tx function of the DDC I2C slave emulation was returning 1
on all writes, resulting in a NACK on the I2C bus. Changing it to
0 makes the DDC I2C work fine with bit-banged I2C such as the
versatile I2C.

I guess it was not affecting whatever I2C controller this was
used with until now, but with the Versatile I2C it surely
does not work.

Signed-off-by: Linus Walleij 
---
 hw/i2c/i2c-ddc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/i2c/i2c-ddc.c b/hw/i2c/i2c-ddc.c
index 199dac9e41c1..bec0c91e2dd0 100644
--- a/hw/i2c/i2c-ddc.c
+++ b/hw/i2c/i2c-ddc.c
@@ -259,12 +259,12 @@ static int i2c_ddc_tx(I2CSlave *i2c, uint8_t data)
 s->reg = data;
 s->firstbyte = false;
 DPRINTF("[EDID] Written new pointer: %u\n", data);
-return 1;
+return 0;
 }
 
 /* Ignore all writes */
 s->reg++;
-return 1;
+return 0;
 }
 
 static void i2c_ddc_init(Object *obj)
-- 
2.14.3




[Qemu-devel] [PATCH 2/3] hw/sii9022: Add support for Silicon Image SII9022

2018-02-17 Thread Linus Walleij
This adds support for emulating the Silicon Image SII9022 DVI/HDMI
bridge. It's not very clever right now; it just acknowledges
the switch into DDC I2C mode and back. Combining this with the
existing DDC I2C emulation gives the right behavior on the Versatile
Express emulation, passing through the QEMU EDID to the emulated
platform.

Signed-off-by: Linus Walleij 
---
 hw/display/Makefile.objs |   1 +
 hw/display/sii9022.c | 185 +++
 2 files changed, 186 insertions(+)
 create mode 100644 hw/display/sii9022.c

diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs
index d3a4cb396eb9..3c7c75b94da5 100644
--- a/hw/display/Makefile.objs
+++ b/hw/display/Makefile.objs
@@ -3,6 +3,7 @@ common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o
 common-obj-$(CONFIG_G364FB) += g364fb.o
 common-obj-$(CONFIG_JAZZ_LED) += jazz_led.o
 common-obj-$(CONFIG_PL110) += pl110.o
+common-obj-$(CONFIG_SII9022) += sii9022.o
 common-obj-$(CONFIG_SSD0303) += ssd0303.o
 common-obj-$(CONFIG_SSD0323) += ssd0323.o
 common-obj-$(CONFIG_XEN) += xenfb.o
diff --git a/hw/display/sii9022.c b/hw/display/sii9022.c
new file mode 100644
index ..d6f3cdc04293
--- /dev/null
+++ b/hw/display/sii9022.c
@@ -0,0 +1,185 @@
+/*
+ * Silicon Image SiI9022
+ *
+ * This is a pretty hollow emulation: all we do is acknowledge that we
+ * exist (chip ID) and confirm that we get switched over into DDC mode
+ * so the emulated host can proceed to read out EDID data. All subsequent
+ * set-up of connectors etc will be acknowledged and ignored.
+ *
+ * Copyright (c) 2018 Linus Walleij
+ *
+ * This code is licensed under the GNU GPL v2.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "hw/i2c/i2c.h"
+
+#define DEBUG_SII9022 0
+
+#define DPRINTF(fmt, ...) \
+do { \
+if (DEBUG_SII9022) { \
+printf("sii9022: " fmt, ## __VA_ARGS__); \
+} \
+} while (0)
+
+#define SII9022_SYS_CTRL_DATA 0x1a
+#define SII9022_SYS_CTRL_PWR_DWN 0x10
+#define SII9022_SYS_CTRL_AV_MUTE 0x08
+#define SII9022_SYS_CTRL_DDC_BUS_REQ 0x04
+#define SII9022_SYS_CTRL_DDC_BUS_GRTD 0x02
+#define SII9022_SYS_CTRL_OUTPUT_MODE 0x01
+#define SII9022_SYS_CTRL_OUTPUT_HDMI 1
+#define SII9022_SYS_CTRL_OUTPUT_DVI 0
+#define SII9022_REG_CHIPID 0x1b
+#define SII9022_INT_ENABLE 0x3c
+#define SII9022_INT_STATUS 0x3d
+#define SII9022_INT_STATUS_HOTPLUG 0x01;
+#define SII9022_INT_STATUS_PLUGGED 0x04;
+
+#define TYPE_SII9022 "sii9022"
+#define SII9022(obj) OBJECT_CHECK(sii9022_state, (obj), TYPE_SII9022)
+
+typedef struct sii9022_state {
+I2CSlave parent_obj;
+uint8_t ptr;
+bool addr_byte;
+bool ddc_req;
+bool ddc_skip_finish;
+bool ddc;
+} sii9022_state;
+
+static const VMStateDescription vmstate_sii9022 = {
+.name = "sii9022",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_I2C_SLAVE(parent_obj, sii9022_state),
+VMSTATE_UINT8(ptr, sii9022_state),
+VMSTATE_BOOL(addr_byte, sii9022_state),
+VMSTATE_BOOL(ddc_req, sii9022_state),
+VMSTATE_BOOL(ddc_skip_finish, sii9022_state),
+VMSTATE_BOOL(ddc, sii9022_state),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static int sii9022_event(I2CSlave *i2c, enum i2c_event event)
+{
+sii9022_state *s = SII9022(i2c);
+
+switch (event) {
+case I2C_START_SEND:
+s->addr_byte = true;
+break;
+case I2C_START_RECV:
+break;
+case I2C_FINISH:
+break;
+case I2C_NACK:
+break;
+}
+
+return 0;
+}
+
+static int sii9022_rx(I2CSlave *i2c)
+{
+sii9022_state *s = SII9022(i2c);
+uint8_t res = 0x00;
+
+switch (s->ptr) {
+case SII9022_SYS_CTRL_DATA:
+if (s->ddc_req) {
+/* Acknowledge DDC bus request */
+res = SII9022_SYS_CTRL_DDC_BUS_GRTD | SII9022_SYS_CTRL_DDC_BUS_REQ;
+}
+break;
+case SII9022_REG_CHIPID:
+res = 0xb0;
+break;
+case SII9022_INT_STATUS:
+/* Something is cold-plugged in, no interrupts */
+res = SII9022_INT_STATUS_PLUGGED;
+break;
+default:
+break;
+}
+DPRINTF("%02x read from %02x\n", res, s->ptr);
+s->ptr++;
+
+return res;
+}
+
+static int sii9022_tx(I2CSlave *i2c, uint8_t data)
+{
+sii9022_state *s = SII9022(i2c);
+
+if (s->addr_byte) {
+s->ptr = data;
+s->addr_byte = false;
+return 0;
+}
+
+switch (s->ptr) {
+case SII9022_SYS_CTRL_DATA:
+if (data & SII9022_SYS_CTRL_DDC_BUS_REQ) {
+s->ddc_req = true;
+if (data & SII9022_SYS_CTRL_DDC_BUS_GRTD) {
+s->ddc = true;
+/* Skip this finish since we just switched to DDC */
+s->ddc_skip_finish = true;
+DPRINTF("switched to DDC mode\n");
+ 

[Qemu-devel] [PATCH 3/3] arm/vexpress: Add proper display connector emulation

2018-02-17 Thread Linus Walleij
This adds the SiI9022 and EDID I2C devices to the ARM Versatile
Express machine, and selects the two I2C devices necessary in the
arm-softmmy.mak configuration so everything will build smoothly.

I am implementing proper handling of the graphics in the Linux
kernel, and adding proper emulation of SiI9022 and EDID makes the
driver probe as nicely as before, retrieving the resolutions
supported by the "QEMU monitor" and overall just working nicely.

The assignment of the SiI9022 at address 0x39 and the EDID
DDC I2C at address 0x50 is not strictly correct: the DDC I2C
is there all the time, but in the actual component it only
appears once activated inside the SiI9022, so ideally it should
be added to and removed from the bus by the SiI9022. However, for this
purpose it works fine to just have it around.

Signed-off-by: Linus Walleij 
---
 default-configs/arm-softmmu.mak | 2 ++
 hw/arm/vexpress.c   | 7 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
index ca34cf446242..54f855d07206 100644
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -21,6 +21,8 @@ CONFIG_STELLARIS_INPUT=y
 CONFIG_STELLARIS_ENET=y
 CONFIG_SSD0303=y
 CONFIG_SSD0323=y
+CONFIG_DDC=y
+CONFIG_SII9022=y
 CONFIG_ADS7846=y
 CONFIG_MAX111X=y
 CONFIG_SSI=y
diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c
index dc5928ae1ab5..d6c912c97684 100644
--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@@ -29,6 +29,7 @@
 #include "hw/arm/arm.h"
 #include "hw/arm/primecell.h"
 #include "hw/devices.h"
+#include "hw/i2c/i2c.h"
 #include "net/net.h"
 #include "sysemu/sysemu.h"
 #include "hw/boards.h"
@@ -537,6 +538,7 @@ static void vexpress_common_init(MachineState *machine)
 uint32_t sys_id;
 DriveInfo *dinfo;
 pflash_t *pflash0;
+I2CBus *i2c;
 ram_addr_t vram_size, sram_size;
 MemoryRegion *sysmem = get_system_memory();
 MemoryRegion *vram = g_new(MemoryRegion, 1);
@@ -628,7 +630,10 @@ static void vexpress_common_init(MachineState *machine)
 sysbus_create_simple("sp804", map[VE_TIMER01], pic[2]);
 sysbus_create_simple("sp804", map[VE_TIMER23], pic[3]);
 
-/* VE_SERIALDVI: not modelled */
+dev = sysbus_create_simple("versatile_i2c", map[VE_SERIALDVI], NULL);
+i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c");
+i2c_create_slave(i2c, "sii9022", 0x39);
+i2c_create_slave(i2c, "i2c-ddc", 0x50);
 
 sysbus_create_simple("pl031", map[VE_RTC], pic[4]); /* RTC */
 
-- 
2.14.3




Re: [Qemu-devel] [PATCH] cuda.h: Fix multiple typedef

2018-02-17 Thread Mark Cave-Ayland

On 16/02/18 18:38, Peter Maydell wrote:


On 16 February 2018 at 17:31, Dr. David Alan Gilbert (git)
 wrote:

From: "Dr. David Alan Gilbert" 

RHEL6's compilers don't like the repeated typedef.

Signed-off-by: Dr. David Alan Gilbert 
---
  include/hw/misc/macio/cuda.h | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/hw/misc/macio/cuda.h b/include/hw/misc/macio/cuda.h
index 6afbdd13ee..494b709579 100644
--- a/include/hw/misc/macio/cuda.h
+++ b/include/hw/misc/macio/cuda.h
@@ -93,12 +93,12 @@ typedef struct CUDAState {
  } CUDAState;

  /* MOS6522 CUDA */
-typedef struct MOS6522CUDAState {
+struct MOS6522CUDAState {
  /*< private >*/
  MOS6522State parent_obj;

  CUDAState *cuda;
-} MOS6522CUDAState;
+};

  #define TYPE_MOS6522_CUDA "mos6522-cuda"
  #define MOS6522_CUDA(obj) OBJECT_CHECK(MOS6522CUDAState, (obj), \
--


Thanks; applied to master as a  buildfix.


Thanks both. Apologies I got caught up later than expected yesterday.


ATB,

Mark.



[Qemu-devel] [PATCH 0/2] tcg: tcg_can_emit_vec_op cleanup+fix

2018-02-17 Thread Richard Henderson
While rearranging/rebasing my SVE patch set, I triggered an
assert in tcg_gen_mul_vec.  Turns out we should not have gone
down that path, due to a missing tcg_can_emit_vec_op check.

Tidy the usage of tcg_can_emit_vec_op to avoid code clutter.


r~


Richard Henderson (2):
  tcg: Fold unspecified opcode test into tcg_can_emit_vec_op
  tcg: Add missing tcg_can_emit_vec_op check in tcg_gen_gvec_2s

 tcg/aarch64/tcg-target.inc.c |  4 
 tcg/i386/tcg-target.inc.c|  4 
 tcg/tcg-op-gvec.c| 37 ++---
 3 files changed, 26 insertions(+), 19 deletions(-)

-- 
2.14.3




[Qemu-devel] [PATCH 2/2] tcg: Add missing tcg_can_emit_vec_op check in tcg_gen_gvec_2s

2018-02-17 Thread Richard Henderson
This led to an assertion failure for 64-bit vector multiply,
which is not available in the AVX instruction set.

Signed-off-by: Richard Henderson 
---
 tcg/tcg-op-gvec.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 29f9cf34b4..432e577c35 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -979,12 +979,15 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, 
uint32_t oprsz,
 
 type = 0;
 if (g->fniv) {
-if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece)) {
 type = TCG_TYPE_V256;
-} else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+} else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
+   && tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece)) {
 type = TCG_TYPE_V128;
 } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
-   && check_size_impl(oprsz, 8)) {
+   && check_size_impl(oprsz, 8)
+   && tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece)) {
 type = TCG_TYPE_V64;
 }
 }
-- 
2.14.3




[Qemu-devel] [PATCH 1/2] tcg: Fold unspecified opcode test into tcg_can_emit_vec_op

2018-02-17 Thread Richard Henderson
This releases the callers from having to check themselves,
which tidies up the code a bit.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.inc.c |  4 
 tcg/i386/tcg-target.inc.c|  4 
 tcg/tcg-op-gvec.c| 28 
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index be3192078d..9b0a803d79 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -2217,6 +2217,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 {
 switch (opc) {
+case 0:
+/* Unspecified opcode */
+return 1;
+
 case INDEX_op_add_vec:
 case INDEX_op_sub_vec:
 case INDEX_op_mul_vec:
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index fc05909d1d..45943e540c 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -3064,6 +3064,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 {
 switch (opc) {
+case 0:
+/* Unspecified opcode.  */
+return 1;
+
 case INDEX_op_add_vec:
 case INDEX_op_sub_vec:
 case INDEX_op_and_vec:
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index bfe44bba81..29f9cf34b4 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -878,7 +878,7 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
store operation.  This is true for aarch64 and x86_64 hosts.  */
 
 if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
-&& (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece)) {
 uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
 if (some == oprsz) {
@@ -891,12 +891,11 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
 }
 
 if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
-&& (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece)) {
 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
 } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
&& g->fniv && check_size_impl(oprsz, 8)
-   && (!g->opc
-   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+   && tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece)) {
 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
 } else if (g->fni8 && check_size_impl(oprsz, 8)) {
 expand_2_i64(dofs, aofs, oprsz, g->fni8);
@@ -926,7 +925,7 @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t 
oprsz,
that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
 
 if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
-&& (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece)) {
 uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
   c, g->load_dest, g->fniv);
@@ -940,13 +939,12 @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, 
uint32_t oprsz,
 }
 
 if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
-&& (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece)) {
 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
   c, g->load_dest, g->fniv);
 } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
&& g->fniv && check_size_impl(oprsz, 8)
-   && (!g->opc
-   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+   && tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece)) {
 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
   c, g->load_dest, g->fniv);
 } else if (g->fni8 && check_size_impl(oprsz, 8)) {
@@ -1063,7 +1061,7 @@ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, 
uint32_t bofs,
that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
 
 if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
-&& (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+&& tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece)) {
 uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
  g->load_dest, g->fniv);
@@ -1078,13 +1076,12 @@ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, 
uint32_t bofs,
 }
 
 if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16

Re: [Qemu-devel] [PULL 09/23] gdbstub: Fix vCont behaviour

2018-02-17 Thread Jan Kiszka
On 2018-02-17 14:27, Alex Bennée wrote:
> 
> Jan Kiszka  writes:
> 
>> On 2018-02-17 09:56, Jan Kiszka wrote:
>>> On 2017-02-16 15:31, Paolo Bonzini wrote:
>>>> [...]
> 
>>>
>>> Seems like no one is doing guest debugging with kvm on x86 except me,
>>> and I'm only doing it too infrequently now: This one broke that use case
>>> for SMP guests long ago. How was it tested?
>>>
>>> To reproduce the bug: set up an x86-64 guest kernel with > 1 core, break
>>> on some prominent syscall entry (e.g. sys_execve), continue the guest on
>>> hit and it will quickly lock up, even after disabling the breakpoint
>>> again. Kernel version doesn't matter (was my first guess), gdb is
>>> 7.7.50.20140604-cvs (OpenSUSE) here.
> 
> I thought I fixed this with 5a6a1ad181c658b810041d852b290ac836965aca
> 
> FWIW I do periodically test ARM TCG and KVM guest debug using:
> 
>   tests/guest-debug/test-gdbstub.py
> 
> But we are missing a nice integration to get an appropriate guest image
> to automate this process. If we can fix that we should be able to turn
> on the test as part of make check.
> 

If that test above is extended with more interesting setups, that should
be enough. E.g., you can reproduce this issue by running qemu with -smp
4 and the following test modifications.

diff --git a/tests/guest-debug/test-gdbstub.py 
b/tests/guest-debug/test-gdbstub.py
index 31ba6c943a..a55782fa9a 100644
--- a/tests/guest-debug/test-gdbstub.py
+++ b/tests/guest-debug/test-gdbstub.py
@@ -15,6 +15,7 @@ def report(cond, msg):
 print ("PASS: %s" % (msg))
 else:
 print ("FAIL: %s" % (msg))
+global failcount
 failcount += 1
 
 
@@ -33,6 +34,7 @@ def check_break(sym_name):
 bp = gdb.Breakpoint(sym_name)
 
 gdb.execute("c")
+gdb.execute("c 100")
 
 # hopefully we came back
 end_pc = gdb.parse_and_eval('$pc')
@@ -138,12 +140,12 @@ def run_test():
 # Can't set this up until we are in the kernel proper
 # if we make it to run_init_process we've over-run and
 # one of the tests failed
-print ("Setup catch-all for run_init_process")
-cbp = CatchBreakpoint("run_init_process")
-cpb2 = CatchBreakpoint("try_to_run_init_process")
+#print ("Setup catch-all for run_init_process")
+#cbp = CatchBreakpoint("run_init_process")
+#cpb2 = CatchBreakpoint("try_to_run_init_process")
 
 print ("Checking Normal breakpoint works")
-break_ok = check_break("wait_for_completion")
+break_ok = check_break("SyS_execve")
 report(break_ok, "break @ wait_for_completion")
 
 print ("Checking watchpoint works")


With -smp 1, check_break succeeds.

Jan





Re: [Qemu-devel] [PATCH 09/11] linux-user/strace: improve capget()/capset() output

2018-02-17 Thread Laurent Vivier
Le 24/01/2018 à 14:01, Philippe Mathieu-Daudé a écrit :
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  linux-user/strace.list | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/linux-user/strace.list b/linux-user/strace.list
> index f90c0e8ee4..f09234345f 100644
> --- a/linux-user/strace.list
> +++ b/linux-user/strace.list
> @@ -59,10 +59,10 @@
>  { TARGET_NR_cacheflush, "cacheflush" , NULL, NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_capget
> -{ TARGET_NR_capget, "capget" , NULL, NULL, NULL },
> +{ TARGET_NR_capget, "capget" , "%s(%p,%p)", NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_capset
> -{ TARGET_NR_capset, "capset" , NULL, NULL, NULL },
> +{ TARGET_NR_capset, "capset" , "%s(%p,%p)", NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_chdir
>  { TARGET_NR_chdir, "chdir" , NULL, print_chdir, NULL },
> 

Reviewed-by: Laurent Vivier 




Re: [Qemu-devel] [PATCH 08/11] linux-user/strace: improve gettimeofday() output

2018-02-17 Thread Laurent Vivier
Le 24/01/2018 à 14:01, Philippe Mathieu-Daudé a écrit :
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  linux-user/strace.c| 13 +
>  linux-user/strace.list |  2 +-
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/linux-user/strace.c b/linux-user/strace.c
> index 4b8ab6bcfb..b7c4cfae58 100644
> --- a/linux-user/strace.c
> +++ b/linux-user/strace.c
> @@ -1526,6 +1526,19 @@ print_futimesat(const struct syscallname *name,
>  }
>  #endif
>  
> +#ifdef TARGET_NR_gettimeofday
> +static void
> +print_gettimeofday(const struct syscallname *name,
> +   abi_long arg0, abi_long arg1, abi_long arg2,
> +   abi_long arg3, abi_long arg4, abi_long arg5)
> +{
> +print_syscall_prologue(name);
> +print_pointer(arg0, 0);
> +print_pointer(arg1, 1);
> +print_syscall_epilogue(name);
> +}
> +#endif
> +
>  #ifdef TARGET_NR_link
>  static void
>  print_link(const struct syscallname *name,
> diff --git a/linux-user/strace.list b/linux-user/strace.list
> index 958d10d48f..f90c0e8ee4 100644
> --- a/linux-user/strace.list
> +++ b/linux-user/strace.list
> @@ -384,7 +384,7 @@
>  { TARGET_NR_gettid, "gettid" , "%s()", NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_gettimeofday
> -{ TARGET_NR_gettimeofday, "gettimeofday" , NULL, NULL, NULL },
> +{ TARGET_NR_gettimeofday, "gettimeofday" , NULL, print_gettimeofday, NULL },
>  #endif
>  #ifdef TARGET_NR_getuid
>  { TARGET_NR_getuid, "getuid" , "%s()", NULL, NULL },
> 

There is a print_timeval(), and you could add a print_timezone(), and
update the result() function pointer to call them.

And once it is done, you can also display them for settimeofday().
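
A rough sketch of such a helper, modelled on the existing print_timeval()
(the struct and helper names are assumptions and would need to be checked
against linux-user/strace.c):

    static void
    print_timezone(abi_ulong tz_addr, int last)
    {
        if (tz_addr) {
            struct target_timezone *tz;

            tz = lock_user(VERIFY_READ, tz_addr, sizeof(*tz), 1);
            if (!tz) {
                print_pointer(tz_addr, last);
                return;
            }
            gemu_log("{%d,%d}%s", tswap32(tz->tz_minuteswest),
                     tswap32(tz->tz_dsttime), get_comma(last));
            unlock_user(tz, tz_addr, 0);
        } else {
            gemu_log("NULL%s", get_comma(last));
        }
    }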

Thanks,
Laurent



Re: [Qemu-devel] [PATCH 07/11] linux-user/strace: improve bind() output

2018-02-17 Thread Laurent Vivier
Le 24/01/2018 à 14:01, Philippe Mathieu-Daudé a écrit :
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  linux-user/strace.c| 13 +
>  linux-user/strace.list |  2 +-
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/linux-user/strace.c b/linux-user/strace.c
> index 70ecb1fc98..4b8ab6bcfb 100644
> --- a/linux-user/strace.c
> +++ b/linux-user/strace.c
> @@ -1935,6 +1935,19 @@ print_socketcall(const struct syscallname *name,
>  }
>  #endif
>  
> +#if defined(TARGET_NR_bind)
> +static void
> +print_bind(const struct syscallname *name,
> +   abi_long arg0, abi_long arg1, abi_long arg2,
> +   abi_long arg3, abi_long arg4, abi_long arg5)
> +{
> +print_syscall_prologue(name);
> +print_raw_param(TARGET_ABI_FMT_ld, arg0, 0);

Other functions use "%d" for fd.
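
i.e., presumably:

    print_raw_param("%d", arg0, 0);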

Thanks,
Laurent



Re: [Qemu-devel] [PATCH 06/11] linux-user/strace: improve recvmsg() output

2018-02-17 Thread Laurent Vivier
Le 24/01/2018 à 14:01, Philippe Mathieu-Daudé a écrit :
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  linux-user/strace.list | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/linux-user/strace.list b/linux-user/strace.list
> index 947ff04eab..ae6dc8fecf 100644
> --- a/linux-user/strace.list
> +++ b/linux-user/strace.list
> @@ -1107,7 +1107,7 @@
>  { TARGET_NR_recvmmsg, "recvmmsg" , NULL, NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_recvmsg
> -{ TARGET_NR_recvmsg, "recvmsg" , NULL, NULL, NULL },
> +{ TARGET_NR_recvmsg, "recvmsg" , "%s(%d,%p,%#x)", NULL, NULL },
>  #endif
>  #ifdef TARGET_NR_remap_file_pages
>  { TARGET_NR_remap_file_pages, "remap_file_pages" , NULL, NULL, NULL },
> 

Reviewed-by: Laurent Vivier 




[Qemu-devel] [PATCH v4 1/1] scripts: Add decodetree.py

2018-02-17 Thread Richard Henderson
To be used to decode ARM SVE, but could be used for any fixed-width ISA.

Signed-off-by: Richard Henderson 

---

Changes since v3:
  * Add copyright notices.
  * Do not mark decoder static if --decode is used.
Use this option to imply the decoder is the main entry point
for a separate translator object file.
  * Rename *.def to *.decode.

Changes since v2:
  * Fix tests/decode/err_init3.def.
  * Mark main decoder static by default.
  * Properly diagnose unspecified bits.
  * Remove output file on error.

Changes since v1:
  * Pass pycodestyle-{2,3}.
  * Support 16-bit and 32-bit insns (I have a def file for thumb1).
  * Testsuite (only negative tests so far).
  * Called translate functions default to static.
  * Notice duplicate assignments and missing assignments to fields.
  * Use '-' to indicate a non-decoded bit, as opposed to '.' which
must be filled in elsewhere by a format or a field.
---
 scripts/decodetree.py| 1062 ++
 tests/Makefile.include   |9 +-
 tests/decode/check.sh|   18 +
 tests/decode/err_argset1.decode  |5 +
 tests/decode/err_argset2.decode  |5 +
 tests/decode/err_field1.decode   |5 +
 tests/decode/err_field2.decode   |5 +
 tests/decode/err_field3.decode   |5 +
 tests/decode/err_field4.decode   |6 +
 tests/decode/err_field5.decode   |5 +
 tests/decode/err_init1.decode|6 +
 tests/decode/err_init2.decode|6 +
 tests/decode/err_init3.decode|7 +
 tests/decode/err_init4.decode|7 +
 tests/decode/err_overlap1.decode |6 +
 tests/decode/err_overlap2.decode |6 +
 tests/decode/err_overlap3.decode |6 +
 tests/decode/err_overlap4.decode |6 +
 tests/decode/err_overlap5.decode |5 +
 tests/decode/err_overlap6.decode |6 +
 tests/decode/err_overlap7.decode |6 +
 tests/decode/err_overlap8.decode |5 +
 tests/decode/err_overlap9.decode |6 +
 23 files changed, 1202 insertions(+), 1 deletion(-)
 create mode 100755 scripts/decodetree.py
 create mode 100755 tests/decode/check.sh
 create mode 100644 tests/decode/err_argset1.decode
 create mode 100644 tests/decode/err_argset2.decode
 create mode 100644 tests/decode/err_field1.decode
 create mode 100644 tests/decode/err_field2.decode
 create mode 100644 tests/decode/err_field3.decode
 create mode 100644 tests/decode/err_field4.decode
 create mode 100644 tests/decode/err_field5.decode
 create mode 100644 tests/decode/err_init1.decode
 create mode 100644 tests/decode/err_init2.decode
 create mode 100644 tests/decode/err_init3.decode
 create mode 100644 tests/decode/err_init4.decode
 create mode 100644 tests/decode/err_overlap1.decode
 create mode 100644 tests/decode/err_overlap2.decode
 create mode 100644 tests/decode/err_overlap3.decode
 create mode 100644 tests/decode/err_overlap4.decode
 create mode 100644 tests/decode/err_overlap5.decode
 create mode 100644 tests/decode/err_overlap6.decode
 create mode 100644 tests/decode/err_overlap7.decode
 create mode 100644 tests/decode/err_overlap8.decode
 create mode 100644 tests/decode/err_overlap9.decode

diff --git a/scripts/decodetree.py b/scripts/decodetree.py
new file mode 100755
index 00..22d2edce9d
--- /dev/null
+++ b/scripts/decodetree.py
@@ -0,0 +1,1062 @@
+#!/usr/bin/env python
+# Copyright (c) 2018 Linaro, Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# Generate a decoding tree from a specification file.
+#
+# The tree is built from instruction "patterns".  A pattern may represent
+# a single architectural instruction or a group of same, depending on what
+# is convenient for further processing.
+#
+# Each pattern has "fixedbits" & "fixedmask", the combination of which
+# describes the condition under which the pattern is matched:
+#
+#   (insn & fixedmask) == fixedbits
+#
+# Each pattern may have "fields", which are extracted from the insn and
+# passed along to the translator.  Examples of such are registers,
+# immediates, and sub-opcodes.
+#
+# In support of patterns, one may declare fields, argument sets, and
+# formats, each of which may be re-used to simplify further definitions.
+#
+# *** Field syntax:
+#
+# field_def := '%' identifier ( unnamed_field )+ ( !function=identifier )?
+# unnamed_field := number ':' ( 's' ) number
+#
+# For unnamed_field, the first number is the least-significant bit position
+# of the field and the second number is the length of the field.
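
As a purely illustrative aside (the opcode mask, value and helper names below are
made up, not taken from the patch), a pattern with fixedmask/fixedbits plus one
signed unnamed field such as "10:s4" corresponds to C along these lines:

#include <stdint.h>
#include <stdbool.h>

/* Same idea as QEMU's sextract32(): sign-extended bitfield extraction. */
static int32_t sextract32_sketch(uint32_t value, int start, int length)
{
    return ((int32_t)(value << (32 - length - start))) >> (32 - length);
}

static bool try_pattern_sketch(uint32_t insn)
{
    const uint32_t fixedmask = 0xff000000;   /* made-up opcode mask  */
    const uint32_t fixedbits = 0x2a000000;   /* made-up opcode value */

    if ((insn & fixedmask) != fixedbits) {
        return false;                        /* not this pattern */
    }
    /* Field "10:s4": 4 bits starting at bit 10, sign-extended. */
    int32_t imm = sextract32_sketch(insn, 10, 4);
    (void)imm;    /* a generated decoder would pass this to a trans_*() hook */
    return true;
}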

Re: [Qemu-devel] [PATCH 03/11] linux-user/strace: add print_sockaddr_ptr() to handle plain/pointer addrlen

2018-02-17 Thread Laurent Vivier
Le 24/01/2018 à 14:01, Philippe Mathieu-Daudé a écrit :
> since this argument differs between sendto()/recvfrom()
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  linux-user/strace.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/linux-user/strace.c b/linux-user/strace.c
> index e7272f4ede..9726d9b378 100644
> --- a/linux-user/strace.c
> +++ b/linux-user/strace.c
> @@ -335,12 +335,15 @@ static void print_siginfo(const target_siginfo_t *tinfo)
>  }
>  
>  static void
> -print_sockaddr(abi_ulong addr, abi_long addrlen)
> +print_sockaddr_ptr(abi_ulong addr, abi_long addrlen, bool addrlen_ptr)
>  {
>  struct target_sockaddr *sa;
>  int i;
>  int sa_family;
>  
> +if (addrlen_ptr) {
> +get_user_ual(addrlen, addrlen);
> +}
>  sa = lock_user(VERIFY_READ, addr, addrlen, 1);
>  if (sa) {
>  sa_family = tswap16(sa->sa_family);
> @@ -417,7 +420,17 @@ print_sockaddr(abi_ulong addr, abi_long addrlen)
>  } else {
>  print_raw_param("0x"TARGET_ABI_FMT_lx, addr, 0);
>  }
> -gemu_log(", "TARGET_ABI_FMT_ld, addrlen);
> +if (addrlen_ptr) {
> +gemu_log(", ["TARGET_ABI_FMT_ld"]", addrlen);
> +} else {
> +gemu_log(", "TARGET_ABI_FMT_ld, addrlen);
> +}
> +}
> +
> +static void
> +print_sockaddr(abi_ulong addr, abi_long addrlen)
> +{
> +print_sockaddr_ptr(addr, addrlen, false);
>  }
>  
>  static void
> 

Why not only something like:

static void
print_sockaddr_ptr(abi_ulong addr, abi_long addrlen_ptr)
{
abi_ulong addrlen;

get_user_ual(addrlen, addrlen_ptr);
print_sockaddr(addr, addrlen);
}

?

You should also check that addrlen_ptr is not NULL (a NULL address-length
pointer is allowed with recvfrom()).
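
Something like this (untested sketch) would cover both points:

static void
print_sockaddr_ptr(abi_ulong addr, abi_ulong addrlen_ptr)
{
    abi_ulong addrlen = 0;

    /* recvfrom() allows a NULL address-length pointer */
    if (addrlen_ptr) {
        get_user_ual(addrlen, addrlen_ptr);
    }
    print_sockaddr(addr, addrlen);
}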

Thanks,
Laurent



[Qemu-devel] [PATCH v2 00/67] target/arm: Scalable Vector Extension

2018-02-17 Thread Richard Henderson
This is 99% of the instruction set.  There are a few things missing,
notably first-fault and non-fault loads (even these are decoded, but
simply treated as normal loads for now).

The patch set is dependent on at least 3 other branches.
A fully composed tree is available as

  git://github.com/rth7680/qemu.git tgt-arm-sve-7

There are a few checkpatch errors due to macros and typedefs, but
nothing that isn't an obvious false positive.

This is able to run SVE-enabled Himeno and LULESH benchmarks as
compiled by last week's gcc-8:

$ ./aarch64-linux-user/qemu-aarch64 ~/himeno-advsimd
mimax = 129 mjmax = 65 mkmax = 65
imax = 128 jmax = 64 kmax =64
cpu : 67.028643 sec.
Loop executed for 200 times
Gosa : 1.688752e-03 
MFLOPS measured : 49.136295
Score based on MMX Pentium 200MHz : 1.522662

$ ./aarch64-linux-user/qemu-aarch64 ~/himeno-sve 
mimax = 129 mjmax = 65 mkmax = 65
imax = 128 jmax = 64 kmax =64
cpu : 43.481213 sec.
Loop executed for 200 times
Gosa : 3.830036e-06 
MFLOPS measured : 75.746259
Score based on MMX Pentium 200MHz : 2.347266

Hopefully the size of the patch set isn't too daunting...


r~


Richard Henderson (67):
  target/arm: Enable SVE for aarch64-linux-user
  target/arm: Introduce translate-a64.h
  target/arm: Add SVE decode skeleton
  target/arm: Implement SVE Bitwise Logical - Unpredicated Group
  target/arm: Implement SVE load vector/predicate
  target/arm: Implement SVE predicate test
  target/arm: Implement SVE Predicate Logical Operations Group
  target/arm: Implement SVE Predicate Misc Group
  target/arm: Implement SVE Integer Binary Arithmetic - Predicated Group
  target/arm: Implement SVE Integer Reduction Group
  target/arm: Implement SVE bitwise shift by immediate (predicated)
  target/arm: Implement SVE bitwise shift by vector (predicated)
  target/arm: Implement SVE bitwise shift by wide elements (predicated)
  target/arm: Implement SVE Integer Arithmetic - Unary Predicated Group
  target/arm: Implement SVE Integer Multiply-Add Group
  target/arm: Implement SVE Integer Arithmetic - Unpredicated Group
  target/arm: Implement SVE Index Generation Group
  target/arm: Implement SVE Stack Allocation Group
  target/arm: Implement SVE Bitwise Shift - Unpredicated Group
  target/arm: Implement SVE Compute Vector Address Group
  target/arm: Implement SVE floating-point exponential accelerator
  target/arm: Implement SVE floating-point trig select coefficient
  target/arm: Implement SVE Element Count Group
  target/arm: Implement SVE Bitwise Immediate Group
  target/arm: Implement SVE Integer Wide Immediate - Predicated Group
  target/arm: Implement SVE Permute - Extract Group
  target/arm: Implement SVE Permute - Unpredicated Group
  target/arm: Implement SVE Permute - Predicates Group
  target/arm: Implement SVE Permute - Interleaving Group
  target/arm: Implement SVE compress active elements
  target/arm: Implement SVE conditionally broadcast/extract element
  target/arm: Implement SVE copy to vector (predicated)
  target/arm: Implement SVE reverse within elements
  target/arm: Implement SVE vector splice (predicated)
  target/arm: Implement SVE Select Vectors Group
  target/arm: Implement SVE Integer Compare - Vectors Group
  target/arm: Implement SVE Integer Compare - Immediate Group
  target/arm: Implement SVE Partition Break Group
  target/arm: Implement SVE Predicate Count Group
  target/arm: Implement SVE Integer Compare - Scalars Group
  target/arm: Implement FDUP/DUP
  target/arm: Implement SVE Integer Wide Immediate - Unpredicated Group
  target/arm: Implement SVE Floating Point Arithmetic - Unpredicated
Group
  target/arm: Implement SVE Memory Contiguous Load Group
  target/arm: Implement SVE Memory Contiguous Store Group
  target/arm: Implement SVE load and broadcast quadword
  target/arm: Implement SVE integer convert to floating-point
  target/arm: Implement SVE floating-point arithmetic (predicated)
  target/arm: Implement SVE FP Multiply-Add Group
  target/arm: Implement SVE Floating Point Accumulating Reduction Group
  target/arm: Implement SVE load and broadcast element
  target/arm: Implement SVE store vector/predicate register
  target/arm: Implement SVE scatter stores
  target/arm: Implement SVE prefetches
  target/arm: Implement SVE gather loads
  target/arm: Implement SVE scatter store vector immediate
  target/arm: Implement SVE floating-point compare vectors
  target/arm: Implement SVE floating-point arithmetic with immediate
  target/arm: Implement SVE Floating Point Multiply Indexed Group
  target/arm: Implement SVE FP Fast Reduction Group
  target/arm: Implement SVE Floating Point Unary Operations -
Unpredicated Group
  target/arm: Implement SVE FP Compare with Zero Group
  target/arm: Implement SVE floating-point trig multiply-add coefficient
  target/arm: Implement SVE floating-point convert precision
  target/arm: Implement SVE floating-point convert to integer
  target/arm: Implement SVE floating-point round to integral value
  target/arm: I

[Qemu-devel] [PATCH v2 01/67] target/arm: Enable SVE for aarch64-linux-user

2018-02-17 Thread Richard Henderson
Enable ARM_FEATURE_SVE for the generic "any" cpu.

Signed-off-by: Richard Henderson 
---
 target/arm/cpu.c   | 7 +++
 target/arm/cpu64.c | 1 +
 2 files changed, 8 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 1b3ae62db6..10843994c3 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -150,6 +150,13 @@ static void arm_cpu_reset(CPUState *s)
 env->cp15.sctlr_el[1] |= SCTLR_UCT | SCTLR_UCI | SCTLR_DZE;
 /* and to the FP/Neon instructions */
 env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 20, 2, 3);
+/* and to the SVE instructions */
+env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 16, 2, 3);
+env->cp15.cptr_el[3] |= CPTR_EZ;
+/* with maximum vector length */
+env->vfp.zcr_el[1] = ARM_MAX_VQ - 1;
+env->vfp.zcr_el[2] = ARM_MAX_VQ - 1;
+env->vfp.zcr_el[3] = ARM_MAX_VQ - 1;
 #else
 /* Reset into the highest available EL */
 if (arm_feature(env, ARM_FEATURE_EL3)) {
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index efc519b49b..36ef9e9d9d 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -231,6 +231,7 @@ static void aarch64_any_initfn(Object *obj)
 set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
 set_feature(&cpu->env, ARM_FEATURE_CRC);
 set_feature(&cpu->env, ARM_FEATURE_V8_FP16);
+set_feature(&cpu->env, ARM_FEATURE_SVE);
 cpu->ctr = 0x80038003; /* 32 byte I and D cacheline size, VIPT icache */
 cpu->dcz_blocksize = 7; /*  512 bytes */
 }
-- 
2.14.3




[Qemu-devel] [PATCH v2 02/67] target/arm: Introduce translate-a64.h

2018-02-17 Thread Richard Henderson
Move some stuff that will be common to both translate-a64.c
and translate-sve.c.

Signed-off-by: Richard Henderson 
---
 target/arm/translate-a64.h | 110 +
 target/arm/translate-a64.c | 101 ++---
 2 files changed, 123 insertions(+), 88 deletions(-)
 create mode 100644 target/arm/translate-a64.h

diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h
new file mode 100644
index 00..e519aee314
--- /dev/null
+++ b/target/arm/translate-a64.h
@@ -0,0 +1,110 @@
+/*
+ *  AArch64 translation, common definitions.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_TRANSLATE_A64_H
+#define TARGET_ARM_TRANSLATE_A64_H
+
+void unallocated_encoding(DisasContext *s);
+
+#define unsupported_encoding(s, insn)\
+do { \
+qemu_log_mask(LOG_UNIMP, \
+  "%s:%d: unsupported instruction encoding 0x%08x "  \
+  "at pc=%016" PRIx64 "\n",  \
+  __FILE__, __LINE__, insn, s->pc - 4);  \
+unallocated_encoding(s); \
+} while (0)
+
+TCGv_i64 new_tmp_a64(DisasContext *s);
+TCGv_i64 new_tmp_a64_zero(DisasContext *s);
+TCGv_i64 cpu_reg(DisasContext *s, int reg);
+TCGv_i64 cpu_reg_sp(DisasContext *s, int reg);
+TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf);
+TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf);
+void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v);
+TCGv_ptr get_fpstatus_ptr(bool);
+bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn,
+unsigned int imms, unsigned int immr);
+uint64_t vfp_expand_imm(int size, uint8_t imm8);
+
+/* We should have at some point before trying to access an FP register
+ * done the necessary access check, so assert that
+ * (a) we did the check and
+ * (b) we didn't then just plough ahead anyway if it failed.
+ * Print the instruction pattern in the abort message so we can figure
+ * out what we need to fix if a user encounters this problem in the wild.
+ */
+static inline void assert_fp_access_checked(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+if (unlikely(!s->fp_access_checked || s->fp_excp_el)) {
+fprintf(stderr, "target-arm: FP access check missing for "
+"instruction 0x%08x\n", s->insn);
+abort();
+}
+#endif
+}
+
+/* Return the offset into CPUARMState of an element of specified
+ * size, 'element' places in from the least significant end of
+ * the FP/vector register Qn.
+ */
+static inline int vec_reg_offset(DisasContext *s, int regno,
+ int element, TCGMemOp size)
+{
+int offs = 0;
+#ifdef HOST_WORDS_BIGENDIAN
+/* This is complicated slightly because vfp.zregs[n].d[0] is
+ * still the low half and vfp.zregs[n].d[1] the high half
+ * of the 128 bit vector, even on big endian systems.
+ * Calculate the offset assuming a fully bigendian 128 bits,
+ * then XOR to account for the order of the two 64 bit halves.
+ */
+offs += (16 - ((element + 1) * (1 << size)));
+offs ^= 8;
+#else
+offs += element * (1 << size);
+#endif
+offs += offsetof(CPUARMState, vfp.zregs[regno]);
+assert_fp_access_checked(s);
+return offs;
+}
+
+/* Return the offset into CPUARMState of the "whole" vector register Qn.  */
+static inline int vec_full_reg_offset(DisasContext *s, int regno)
+{
+assert_fp_access_checked(s);
+return offsetof(CPUARMState, vfp.zregs[regno]);
+}
+
+/* Return a newly allocated pointer to the vector register.  */
+static inline TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno)
+{
+TCGv_ptr ret = tcg_temp_new_ptr();
+tcg_gen_addi_ptr(ret, cpu_env, vec_full_reg_offset(s, regno));
+return ret;
+}
+
+/* Return the byte size of the "whole" vector register, VL / 8.  */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+return s->sve_len;
+}
+
+bool disas_sve(DisasContext *, uint32_t);
+
+#endif /* TARGET_ARM_TRANSLATE_A64_H */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 032cbfa17d..e0e7ebf68c 100644
--- a/target/arm/trans

[Qemu-devel] [PATCH v2 15/67] target/arm: Implement SVE Integer Multiply-Add Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 18 ++
 target/arm/sve_helper.c| 58 +-
 target/arm/translate-sve.c | 31 +
 target/arm/sve.decode  | 17 ++
 4 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 11644125d1..b31d497f31 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -345,6 +345,24 @@ DEF_HELPER_FLAGS_4(sve_neg_h, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_neg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_neg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_6(sve_mla_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mla_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mla_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mla_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_mls_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mls_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mls_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_mls_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e11823a727..4b08a38ce8 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -932,6 +932,62 @@ DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
 #undef DO_SHR
 #undef DO_SHL
 #undef DO_ASRD
-
 #undef DO_ZPZI
 #undef DO_ZPZI_D
+
+/* Fully general four-operand expander, controlled by a predicate.
+ */
+#define DO_ZPZZZ(NAME, TYPE, H, OP)   \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
+  void *vg, uint32_t desc)\
+{ \
+intptr_t i, opr_sz = simd_oprsz(desc);\
+for (i = 0; i < opr_sz; ) {   \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));   \
+do {  \
+if (pg & 1) { \
+TYPE nn = *(TYPE *)(vn + H(i));   \
+TYPE mm = *(TYPE *)(vm + H(i));   \
+TYPE aa = *(TYPE *)(va + H(i));   \
+*(TYPE *)(vd + H(i)) = OP(aa, nn, mm);\
+} \
+i += sizeof(TYPE), pg >>= sizeof(TYPE);   \
+} while (i & 15); \
+} \
+}
+
+/* Similarly, specialized for 64-bit operands.  */
+#define DO_ZPZZZ_D(NAME, TYPE, OP)\
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
+  void *vg, uint32_t desc)\
+{ \
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;\
+TYPE *d = vd, *a = va, *n = vn, *m = vm;  \
+uint8_t *pg = vg; \
+for (i = 0; i < opr_sz; i += 1) { \
+if (pg[H1(i)] & 1) {  \
+TYPE aa = a[i], nn = n[i], mm = m[i]; \
+d[i] = OP(aa, nn, mm);\
+} \
+} \
+}
+
+#define DO_MLA(A, N, M)  (A + N * M)
+#define DO_MLS(A, N, M)  (A - N * M)
+
+DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
+DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
+
+DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
+DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
+
+DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
+DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
+
+DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
+DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
+
+#undef DO_MLA
+#undef DO_MLS
+#undef DO_ZPZZZ
+#undef DO_ZPZZZ_D
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index dce8ba8dc0..b956d87636 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -609,6 +609,37 @@ DO_ZPZW(LSL, lsl)
 
 #undef DO_ZPZW
 
+/*
+ *** SVE Integer Multiply-Add Group
+ */
+
+static void do_zpzz

[Qemu-devel] [PATCH v2 03/67] target/arm: Add SVE decode skeleton

2018-02-17 Thread Richard Henderson
Include only 4 as-yet-unimplemented instruction patterns
so that the whole thing compiles.

Signed-off-by: Richard Henderson 
---
 target/arm/translate-a64.c | 11 +++-
 target/arm/translate-sve.c | 63 ++
 .gitignore |  1 +
 target/arm/Makefile.objs   | 10 
 target/arm/sve.decode  | 45 +
 5 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 target/arm/translate-sve.c
 create mode 100644 target/arm/sve.decode

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index e0e7ebf68c..a50fef98af 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12772,9 +12772,18 @@ static void disas_a64_insn(CPUARMState *env, 
DisasContext *s)
 s->fp_access_checked = false;
 
 switch (extract32(insn, 25, 4)) {
-case 0x0: case 0x1: case 0x2: case 0x3: /* UNALLOCATED */
+case 0x0: case 0x1: case 0x3: /* UNALLOCATED */
 unallocated_encoding(s);
 break;
+case 0x2:
+if (!arm_dc_feature(s, ARM_FEATURE_SVE)) {
+unallocated_encoding(s);
+} else if (!sve_access_check(s) || !fp_access_check(s)) {
+/* exception raised */
+} else if (!disas_sve(s, insn)) {
+unallocated_encoding(s);
+}
+break;
 case 0x8: case 0x9: /* Data processing - immediate */
 disas_data_proc_imm(s, insn);
 break;
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
new file mode 100644
index 00..2c9e4733cb
--- /dev/null
+++ b/target/arm/translate-sve.c
@@ -0,0 +1,63 @@
+/*
+ * AArch64 SVE translation
+ *
+ * Copyright (c) 2018 Linaro, Ltd
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "tcg-op.h"
+#include "tcg-op-gvec.h"
+#include "qemu/log.h"
+#include "arm_ldst.h"
+#include "translate.h"
+#include "internals.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+#include "exec/log.h"
+#include "trace-tcg.h"
+#include "translate-a64.h"
+
+/*
+ * Include the generated decoder.
+ */
+
+#include "decode-sve.inc.c"
+
+/*
+ * Implement all of the translator functions referenced by the decoder.
+ */
+
+static void trans_AND_zzz(DisasContext *s, arg_AND_zzz *a, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+static void trans_ORR_zzz(DisasContext *s, arg_ORR_zzz *a, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+static void trans_EOR_zzz(DisasContext *s, arg_EOR_zzz *a, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+static void trans_BIC_zzz(DisasContext *s, arg_BIC_zzz *a, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
diff --git a/.gitignore b/.gitignore
index 704b22285d..abe2b81a26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -140,3 +140,4 @@ trace-dtrace-root.h
 trace-dtrace-root.dtrace
 trace-ust-all.h
 trace-ust-all.c
+/target/arm/decode-sve.inc.c
diff --git a/target/arm/Makefile.objs b/target/arm/Makefile.objs
index 847fb52ee0..9934cf1d4d 100644
--- a/target/arm/Makefile.objs
+++ b/target/arm/Makefile.objs
@@ -10,3 +10,13 @@ obj-y += gdbstub.o
 obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o
 obj-y += crypto_helper.o
 obj-$(CONFIG_SOFTMMU) += arm-powerctl.o
+
+DECODETREE = $(SRC_PATH)/scripts/decodetree.py
+
+target/arm/decode-sve.inc.c: $(SRC_PATH)/target/arm/sve.decode $(DECODETREE)
+   $(call quiet-command,\
+ $(PYTHON) $(DECODETREE) --decode disas_sve -o $@ $<,\
+ "GEN", $(TARGET_DIR)$@)
+
+target/arm/translate-sve.o: target/arm/decode-sve.inc.c
+obj-$(TARGET_AARCH64) += translate-sve.o
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
new file mode 100644
index 00..2c13a6024a
--- /dev/null
+++ b/target/arm/sve.decode
@@ -0,0 +1,45 @@
+# AArch64 SVE instruction descriptions
+#
+#  Copyright (c) 2017 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABI

[Qemu-devel] [PATCH v2 05/67] target/arm: Implement SVE load vector/predicate

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 132 +
 target/arm/sve.decode  |  22 +++-
 2 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 50cf2a1fdd..c0cccfda6f 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -46,6 +46,19 @@ typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
  * Implement all of the translator functions referenced by the decoder.
  */
 
+/* Return the offset into CPUARMState of the predicate vector register Pn.
+ * Note for this purpose, FFR is P16.  */
+static inline int pred_full_reg_offset(DisasContext *s, int regno)
+{
+return offsetof(CPUARMState, vfp.pregs[regno]);
+}
+
+/* Return the byte size of the whole predicate register, VL / 64.  */
+static inline int pred_full_reg_size(DisasContext *s)
+{
+return s->sve_len >> 3;
+}
+
 /* Invoke a vector expander on two Zregs.  */
 static void do_vector2_z(DisasContext *s, GVecGen2Fn *gvec_fn,
  int esz, int rd, int rn)
@@ -97,3 +110,122 @@ static void trans_BIC_zzz(DisasContext *s, arg_BIC_zzz *a, 
uint32_t insn)
 {
 do_vector3_z(s, tcg_gen_gvec_andc, 0, a->rd, a->rn, a->rm);
 }
+
+/*
+ *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
+ */
+
+/* Subroutine loading a vector register at VOFS of LEN bytes.
+ * The load should begin at the address Rn + IMM.
+ */
+
+#if UINTPTR_MAX == UINT32_MAX
+# define ptr i32
+#else
+# define ptr i64
+#endif
+
+static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len,
+   int rn, int imm)
+{
+uint32_t len_align = QEMU_ALIGN_DOWN(len, 8);
+uint32_t len_remain = len % 8;
+uint32_t nparts = len / 8 + ctpop8(len_remain);
+int midx = get_mem_index(s);
+TCGv_i64 addr, t0, t1;
+
+addr = tcg_temp_new_i64();
+t0 = tcg_temp_new_i64();
+
+/* Note that unpredicated load/store of vector/predicate registers
+ * are defined as a stream of bytes, which equates to little-endian
+ * operations on larger quantities.  There is no nice way to force
+ * a little-endian load for aarch64_be-linux-user out of line.
+ *
+ * Attempt to keep code expansion to a minimum by limiting the
+ * amount of unrolling done.
+ */
+if (nparts <= 4) {
+int i;
+
+for (i = 0; i < len_align; i += 8) {
+tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i);
+tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ);
+tcg_gen_st_i64(t0, cpu_env, vofs + i);
+}
+} else {
+TCGLabel *loop = gen_new_label();
+TCGv_ptr i = TCGV_NAT_TO_PTR(glue(tcg_const_local_, ptr)(0));
+TCGv_ptr dest;
+
+gen_set_label(loop);
+
+/* Minimize the number of local temps that must be re-read from
+ * the stack each iteration.  Instead, re-compute values other
+ * than the loop counter.
+ */
+dest = tcg_temp_new_ptr();
+tcg_gen_addi_ptr(dest, i, imm);
+#if UINTPTR_MAX == UINT32_MAX
+tcg_gen_extu_i32_i64(addr, TCGV_PTR_TO_NAT(dest));
+tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn));
+#else
+tcg_gen_add_i64(addr, TCGV_PTR_TO_NAT(dest), cpu_reg_sp(s, rn));
+#endif
+
+tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ);
+
+tcg_gen_add_ptr(dest, cpu_env, i);
+tcg_gen_addi_ptr(i, i, 8);
+tcg_gen_st_i64(t0, dest, vofs);
+tcg_temp_free_ptr(dest);
+
+glue(tcg_gen_brcondi_, ptr)(TCG_COND_LTU, TCGV_PTR_TO_NAT(i),
+len_align, loop);
+tcg_temp_free_ptr(i);
+}
+
+/* Predicate register loads can be any multiple of 2.
+ * Note that we still store the entire 64-bit unit into cpu_env.
+ */
+if (len_remain) {
+tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align);
+
+switch (len_remain) {
+case 2:
+case 4:
+case 8:
+tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LE | ctz32(len_remain));
+break;
+
+case 6:
+t1 = tcg_temp_new_i64();
+tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUL);
+tcg_gen_addi_i64(addr, addr, 4);
+tcg_gen_qemu_ld_i64(t1, addr, midx, MO_LEUW);
+tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
+tcg_temp_free_i64(t1);
+break;
+
+default:
+g_assert_not_reached();
+}
+tcg_gen_st_i64(t0, cpu_env, vofs + len_align);
+}
+tcg_temp_free_i64(addr);
+tcg_temp_free_i64(t0);
+}
+
+#undef ptr
+
+static void trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+int size = vec_full_reg_size(s);
+do_ldr(s, vec_full_reg_offset(s, a->rd), size, a->rn, a->imm * size);
+}
+
+static void trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+int size = pred_full_reg_size(s);
+do_ldr(s, pred_full_reg_offset(s, a->rd), size, a->rn, a

[Qemu-devel] [PATCH v2 07/67] target/arm: Implement SVE Predicate Logical Operations Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/cpu.h   |   4 +-
 target/arm/helper-sve.h|  10 ++
 target/arm/sve_helper.c|  39 ++
 target/arm/translate-sve.c | 338 -
 target/arm/sve.decode  |  16 +++
 5 files changed, 405 insertions(+), 2 deletions(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 70e05f00fe..8befe43a01 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -527,6 +527,8 @@ typedef struct CPUARMState {
 #ifdef TARGET_AARCH64
 /* Store FFR as pregs[16] to make it easier to treat as any other.  */
 ARMPredicateReg pregs[17];
+/* Scratch space for aa64 sve predicate temporary.  */
+ARMPredicateReg preg_tmp;
 #endif
 
 uint32_t xregs[16];
@@ -534,7 +536,7 @@ typedef struct CPUARMState {
 int vec_len;
 int vec_stride;
 
-/* scratch space when Tn are not sufficient.  */
+/* Scratch space for aa32 neon expansion.  */
 uint32_t scratch[8];
 
 /* There are a number of distinct float control structures:
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index b6e91539ae..57adc4d912 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -19,3 +19,13 @@
 
 DEF_HELPER_FLAGS_2(sve_predtest1, TCG_CALL_NO_WG, i32, i64, i64)
 DEF_HELPER_FLAGS_3(sve_predtest, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sel_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_orr_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_orn_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_nor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_nand_, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 7d13fd40ed..b63e7cc90e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -75,3 +75,42 @@ uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t 
words)
 
 return flags;
 }
+
+#define LOGICAL_(NAME, FUNC) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
+{ \
+uintptr_t opr_sz = simd_oprsz(desc);  \
+uint64_t *d = vd, *n = vn, *m = vm, *g = vg;  \
+uintptr_t i;  \
+for (i = 0; i < opr_sz / 8; ++i) {\
+d[i] = FUNC(n[i], m[i], g[i]);\
+} \
+}
+
+#define DO_AND(N, M, G)  (((N) & (M)) & (G))
+#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
+#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
+#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
+#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
+#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
+#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
+#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
+
+LOGICAL_(sve_and_, DO_AND)
+LOGICAL_(sve_bic_, DO_BIC)
+LOGICAL_(sve_eor_, DO_EOR)
+LOGICAL_(sve_sel_, DO_SEL)
+LOGICAL_(sve_orr_, DO_ORR)
+LOGICAL_(sve_orn_, DO_ORN)
+LOGICAL_(sve_nor_, DO_NOR)
+LOGICAL_(sve_nand_, DO_NAND)
+
+#undef DO_AND
+#undef DO_BIC
+#undef DO_EOR
+#undef DO_ORR
+#undef DO_ORN
+#undef DO_NOR
+#undef DO_NAND
+#undef DO_SEL
+#undef LOGICAL_
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index c2e7fac938..405f9397a1 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -59,6 +59,24 @@ static inline int pred_full_reg_size(DisasContext *s)
 return s->sve_len >> 3;
 }
 
+/* Round up the size of a predicate register to a size allowed by
+ * the tcg vector infrastructure.  Any operation which uses this
+ * size may assume that the bits above pred_full_reg_size are zero,
+ * and must leave them the same way.
+ *
+ * Note that this is not needed for the vector registers as they
+ * are always properly sized for tcg vectors.
+ */
+static int pred_gvec_reg_size(DisasContext *s)
+{
+int size = pred_full_reg_size(s);
+if (size <= 8) {
+return 8;
+} else {
+return QEMU_ALIGN_UP(size, 16);
+}
+}
+
 /* Invoke a vector expander on two Zregs.  */
 static void do_vector2_z(DisasContext *s, GVecGen2Fn *gvec_fn,
  int esz, int rd, int rn)
@@ -83,6 +101,40 @@ static void do_mov_z(DisasContext *s, int rd, int rn)
 do_vector2_z(s, tcg_gen_gvec_mov, 0, rd, rn);
 }
 
+/* Invoke a vector expander 

[Qemu-devel] [PATCH v2 16/67] target/arm: Implement SVE Integer Arithmetic - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 41 ++---
 target/arm/sve.decode  | 13 +
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index b956d87636..8baec6c674 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -235,6 +235,40 @@ static void trans_BIC_zzz(DisasContext *s, arg_BIC_zzz *a, 
uint32_t insn)
 do_vector3_z(s, tcg_gen_gvec_andc, 0, a->rd, a->rn, a->rm);
 }
 
+/*
+ *** SVE Integer Arithmetic - Unpredicated Group
+ */
+
+static void trans_ADD_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_add, a->esz, a->rd, a->rn, a->rm);
+}
+
+static void trans_SUB_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_sub, a->esz, a->rd, a->rn, a->rm);
+}
+
+static void trans_SQADD_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_ssadd, a->esz, a->rd, a->rn, a->rm);
+}
+
+static void trans_SQSUB_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_sssub, a->esz, a->rd, a->rn, a->rm);
+}
+
+static void trans_UQADD_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_usadd, a->esz, a->rd, a->rn, a->rm);
+}
+
+static void trans_UQSUB_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_ussub, a->esz, a->rd, a->rn, a->rm);
+}
+
 /*
  *** SVE Integer Arithmetic - Binary Predicated Group
  */
@@ -254,7 +288,8 @@ static void do_zpzz_ool(DisasContext *s, arg_rprr_esz *a, 
gen_helper_gvec_4 *fn)
 }
 
 #define DO_ZPZZ(NAME, name) \
-void trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) \
+static void trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, \
+uint32_t insn)\
 { \
 static gen_helper_gvec_4 * const fns[4] = {   \
 gen_helper_sve_##name##_zpzz_b, gen_helper_sve_##name##_zpzz_h,   \
@@ -286,7 +321,7 @@ DO_ZPZZ(ASR, asr)
 DO_ZPZZ(LSR, lsr)
 DO_ZPZZ(LSL, lsl)
 
-void trans_SDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+static void trans_SDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
 {
 static gen_helper_gvec_4 * const fns[4] = {
 NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d
@@ -294,7 +329,7 @@ void trans_SDIV_zpzz(DisasContext *s, arg_rprr_esz *a, 
uint32_t insn)
 do_zpzz_ool(s, a, fns[a->esz]);
 }
 
-void trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+static void trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
 {
 static gen_helper_gvec_4 * const fns[4] = {
 NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 68a1823b72..b40d7dc9a2 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -68,6 +68,9 @@
 # Three predicate operand, with governing predicate, flag setting
 @pd_pg_pn_pm_s  . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4&rprr_s
 
+# Three operand, vector element size
+@rd_rn_rm   esz:2 . rm:5  ... ...  rn:5 rd:5   &rrr_esz
+
 # Two register operand, with governing predicate, vector element size
 @rdn_pg_rm  esz:2 ... ... ... pg:3 rm:5 rd:5 \
&rprr_esz rn=%reg_movprfx
@@ -205,6 +208,16 @@ MLS0100 .. 0 . 011 ... . . 
  @rda_pg_rn_rm
 MLA0100 .. 0 . 110 ... . .   @rdn_pg_ra_rm # MAD
 MLS0100 .. 0 . 111 ... . .   @rdn_pg_ra_rm # MSB
 
+### SVE Integer Arithmetic - Unpredicated Group
+
+# SVE integer add/subtract vectors (unpredicated)
+ADD_zzz0100 .. 1 . 000 000 . .  @rd_rn_rm
+SUB_zzz0100 .. 1 . 000 001 . .  @rd_rn_rm
+SQADD_zzz  0100 .. 1 . 000 100 . . @rd_rn_rm
+UQADD_zzz  0100 .. 1 . 000 101 . . @rd_rn_rm
+SQSUB_zzz  0100 .. 1 . 000 110 . . @rd_rn_rm
+UQSUB_zzz  0100 .. 1 . 000 111 . . @rd_rn_rm
+
 ### SVE Logical - Unpredicated Group
 
 # SVE bitwise logical operations (unpredicated)
-- 
2.14.3




[Qemu-devel] [PATCH v2 04/67] target/arm: Implement SVE Bitwise Logical - Unpredicated Group

2018-02-17 Thread Richard Henderson
These were the instructions that were stubbed out when
introducing the decode skeleton.

Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 50 +++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 2c9e4733cb..50cf2a1fdd 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -32,6 +32,10 @@
 #include "trace-tcg.h"
 #include "translate-a64.h"
 
+typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
+uint32_t, uint32_t, uint32_t);
+
 /*
  * Include the generated decoder.
  */
@@ -42,22 +46,54 @@
  * Implement all of the translator functions referenced by the decoder.
  */
 
-static void trans_AND_zzz(DisasContext *s, arg_AND_zzz *a, uint32_t insn)
+/* Invoke a vector expander on two Zregs.  */
+static void do_vector2_z(DisasContext *s, GVecGen2Fn *gvec_fn,
+ int esz, int rd, int rn)
 {
-unsupported_encoding(s, insn);
+unsigned vsz = vec_full_reg_size(s);
+gvec_fn(esz, vec_full_reg_offset(s, rd),
+vec_full_reg_offset(s, rn), vsz, vsz);
 }
 
-static void trans_ORR_zzz(DisasContext *s, arg_ORR_zzz *a, uint32_t insn)
+/* Invoke a vector expander on three Zregs.  */
+static void do_vector3_z(DisasContext *s, GVecGen3Fn *gvec_fn,
+ int esz, int rd, int rn, int rm)
 {
-unsupported_encoding(s, insn);
+unsigned vsz = vec_full_reg_size(s);
+gvec_fn(esz, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+vec_full_reg_offset(s, rm), vsz, vsz);
 }
 
-static void trans_EOR_zzz(DisasContext *s, arg_EOR_zzz *a, uint32_t insn)
+/* Invoke a vector move on two Zregs.  */
+static void do_mov_z(DisasContext *s, int rd, int rn)
 {
-unsupported_encoding(s, insn);
+do_vector2_z(s, tcg_gen_gvec_mov, 0, rd, rn);
+}
+
+/*
+ *** SVE Logical - Unpredicated Group
+ */
+
+static void trans_AND_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_and, 0, a->rd, a->rn, a->rm);
+}
+
+static void trans_ORR_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+if (a->rn == a->rm) { /* MOV */
+do_mov_z(s, a->rd, a->rn);
+} else {
+do_vector3_z(s, tcg_gen_gvec_or, 0, a->rd, a->rn, a->rm);
+}
+}
+
+static void trans_EOR_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+do_vector3_z(s, tcg_gen_gvec_xor, 0, a->rd, a->rn, a->rm);
 }
 
 static void trans_BIC_zzz(DisasContext *s, arg_BIC_zzz *a, uint32_t insn)
 {
-unsupported_encoding(s, insn);
+do_vector3_z(s, tcg_gen_gvec_andc, 0, a->rd, a->rn, a->rm);
 }
-- 
2.14.3




[Qemu-devel] [PATCH v2 06/67] target/arm: Implement SVE predicate test

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 21 +
 target/arm/helper.h|  1 +
 target/arm/sve_helper.c| 77 ++
 target/arm/translate-sve.c | 62 +
 target/arm/Makefile.objs   |  2 +-
 target/arm/sve.decode  |  5 +++
 6 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 target/arm/helper-sve.h
 create mode 100644 target/arm/sve_helper.c

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
new file mode 100644
index 00..b6e91539ae
--- /dev/null
+++ b/target/arm/helper-sve.h
@@ -0,0 +1,21 @@
+/*
+ *  AArch64 SVE specific helper definitions
+ *
+ *  Copyright (c) 2018 Linaro, Ltd
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+DEF_HELPER_FLAGS_2(sve_predtest1, TCG_CALL_NO_WG, i32, i64, i64)
+DEF_HELPER_FLAGS_3(sve_predtest, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 6dd8504ec3..be3c2fcdc0 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -567,4 +567,5 @@ DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, 
i64, i64, i64)
 
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
+#include "helper-sve.h"
 #endif
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
new file mode 100644
index 00..7d13fd40ed
--- /dev/null
+++ b/target/arm/sve_helper.c
@@ -0,0 +1,77 @@
+/*
+ *  ARM SVE Operations
+ *
+ *  Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+
+
+/* Return a value for NZCV as per the ARM PredTest pseudofunction.
+ *
+ * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
+ * and bit 0 set if C is set.
+ *
+ * This is an iterative function, called for each Pd and Pg word
+ * moving forward.
+ */
+
+/* For no G bits set, NZCV = C.  */
+#define PREDTEST_INIT  1
+
+static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
+{
+if (g) {
+/* Compute N from first D & G.
+   Use bit 2 to signal first G bit seen.  */
+if (!(flags & 4)) {
+flags |= ((d & (g & -g)) != 0) << 31;
+flags |= 4;
+}
+
+/* Accumulate Z from each D & G.  */
+flags |= ((d & g) != 0) << 1;
+
+/* Compute C from last !(D & G).  Replace previous.  */
+flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
+}
+return flags;
+}
+
+/* The same for a single word predicate.  */
+uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
+{
+return iter_predtest_fwd(d, g, PREDTEST_INIT);
+}
+
+/* The same for a multi-word predicate.  */
+uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
+{
+uint32_t flags = PREDTEST_INIT;
+uint64_t *d = vd, *g = vg;
+uintptr_t i = 0;
+
+do {
+flags = iter_predtest_fwd(d[i], g[i], flags);
+} while (++i < words);
+
+return flags;
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index c0cccfda6f..c2e7fac938 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -83,6 +83,43 @@ static void do_mov_z(DisasContext *s, int rd, int rn)
 do_vector2_z(s, tcg_gen_gvec_mov, 0, rd, rn);
 }
 
+/* Set the cpu flags as per a return from an SVE helper.  */
+static void do_pred_flags(TCGv_i32 t)
+{
+tcg_gen_mov_i32(cpu_NF, t);
+tcg_gen_andi_i32(cpu_ZF, t, 2);
+tcg_gen_andi_i32(cpu_CF, t, 1);
+tcg_gen_movi_i32(cpu_VF, 0);
+}
+
+/* Subroutines computing the ARM PredTest pseudofunction.  */
+static void do_predtes

[Qemu-devel] [PATCH v2 09/67] target/arm: Implement SVE Integer Binary Arithmetic - Predicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 145 +
 target/arm/sve_helper.c| 196 -
 target/arm/translate-sve.c |  65 +++
 target/arm/sve.decode  |  42 ++
 4 files changed, 447 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0c04afff8c..5b82ba1501 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -23,6 +23,151 @@ DEF_HELPER_FLAGS_3(sve_predtest, TCG_CALL_NO_WG, i32, ptr, 
ptr, i32)
 DEF_HELPER_FLAGS_3(sve_pfirst, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_pnext, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_and_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_and_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_and_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_and_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_eor_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_eor_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_eor_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_eor_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_orr_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_orr_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_orr_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_orr_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_bic_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bic_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bic_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bic_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_add_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_add_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_add_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_add_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_sub_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sub_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sub_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sub_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_smax_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smax_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smax_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smax_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_umax_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umax_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umax_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umax_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_smin_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smin_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smin_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_smin_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_umin_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umin_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umin_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_umin_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_sabd_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr,

[Qemu-devel] [PATCH v2 08/67] target/arm: Implement SVE Predicate Misc Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/cpu.h   |   3 +
 target/arm/helper-sve.h|   3 +
 target/arm/sve_helper.c|  86 +++-
 target/arm/translate-sve.c | 163 -
 target/arm/sve.decode  |  41 
 5 files changed, 293 insertions(+), 3 deletions(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 8befe43a01..27f395183b 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -2915,4 +2915,7 @@ static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, 
unsigned regno)
 return &env->vfp.zregs[regno].d[0];
 }
 
+/* Shared between translate-sve.c and sve_helper.c.  */
+extern const uint64_t pred_esz_masks[4];
+
 #endif
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 57adc4d912..0c04afff8c 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -20,6 +20,9 @@
 DEF_HELPER_FLAGS_2(sve_predtest1, TCG_CALL_NO_WG, i32, i64, i64)
 DEF_HELPER_FLAGS_3(sve_predtest, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_pfirst, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_pnext, TCG_CALL_NO_WG, i32, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b63e7cc90e..cee7d9bcf6 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -39,7 +39,7 @@
 
 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
 {
-if (g) {
+if (likely(g)) {
 /* Compute N from first D & G.
Use bit 2 to signal first G bit seen.  */
 if (!(flags & 4)) {
@@ -114,3 +114,87 @@ LOGICAL_(sve_nand_, DO_NAND)
 #undef DO_NAND
 #undef DO_SEL
 #undef LOGICAL_
+
+/* Similar to the ARM LastActiveElement pseudocode function, except the
+   result is multiplied by the element size.  This includes the not found
+   indication; e.g. not found for esz=3 is -8.  */
+static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
+{
+uint64_t mask = pred_esz_masks[esz];
+intptr_t i = words;
+
+do {
+uint64_t this_g = g[--i] & mask;
+if (this_g) {
+return i * 64 + (63 - clz64(this_g));
+}
+} while (i > 0);
+return (intptr_t)-1 << esz;
+}
+
+uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
+{
+uint32_t flags = PREDTEST_INIT;
+uint64_t *d = vd, *g = vg;
+intptr_t i = 0;
+
+do {
+uint64_t this_d = d[i];
+uint64_t this_g = g[i];
+
+if (this_g) {
+if (!(flags & 4)) {
+/* Set in D the first bit of G.  */
+this_d |= this_g & -this_g;
+d[i] = this_d;
+}
+flags = iter_predtest_fwd(this_d, this_g, flags);
+}
+} while (++i < words);
+
+return flags;
+}
+
+uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
+{
+intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
+intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+uint32_t flags = PREDTEST_INIT;
+uint64_t *d = vd, *g = vg, esz_mask;
+intptr_t i, next;
+
+next = last_active_element(vd, words, esz) + (1 << esz);
+esz_mask = pred_esz_masks[esz];
+
+/* Similar to the pseudocode for pnext, but scaled by ESZ
+   so that we find the correct bit.  */
+if (next < words * 64) {
+uint64_t mask = -1;
+
+if (next & 63) {
+mask = ~((1ull << (next & 63)) - 1);
+next &= -64;
+}
+do {
+uint64_t this_g = g[next / 64] & esz_mask & mask;
+if (this_g != 0) {
+next = (next & -64) + ctz64(this_g);
+break;
+}
+next += 64;
+mask = -1;
+} while (next < words * 64);
+}
+
+i = 0;
+do {
+uint64_t this_d = 0;
+if (i == next / 64) {
+this_d = 1ull << (next & 63);
+}
+d[i] = this_d;
+flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
+} while (++i < words);
+
+return flags;
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 405f9397a1..a9b6ae046d 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -22,6 +22,7 @@
 #include "exec/exec-all.h"
 #include "tcg-op.h"
 #include "tcg-op-gvec.h"
+#include "tcg-gvec-desc.h"
 #include "qemu/log.h"
 #include "arm_ldst.h"
 #include "translate.h"
@@ -67,9 +68,8 @@ static inline int pred_full_reg_size(DisasContext *s)
  * Note that this is not needed for the vector registers as they
  * are always properly sized for tcg vectors.
  */
-static int pred_gvec_reg_size(DisasContext *s)
+static int size_for_gvec(int size)
 {
-int size = pred_f

[Qemu-devel] [PATCH v2 27/67] target/arm: Implement SVE Permute - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  23 +
 target/arm/translate-a64.h |  14 +++---
 target/arm/sve_helper.c| 114 +
 target/arm/translate-sve.c | 113 
 target/arm/sve.decode  |  29 +++-
 5 files changed, 285 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 94f4356ce9..0c9aad575e 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -416,6 +416,29 @@ DEF_HELPER_FLAGS_4(sve_cpy_z_d, TCG_CALL_NO_RWG, void, 
ptr, ptr, i64, i32)
 
 DEF_HELPER_FLAGS_4(sve_ext, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_insr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_insr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_insr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_insr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_3(sve_rev_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_rev_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_rev_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_rev_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_tbl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_tbl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_tbl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_tbl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_sunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_sunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_sunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_uunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h
index e519aee314..328aa7fce1 100644
--- a/target/arm/translate-a64.h
+++ b/target/arm/translate-a64.h
@@ -66,18 +66,18 @@ static inline void assert_fp_access_checked(DisasContext *s)
 static inline int vec_reg_offset(DisasContext *s, int regno,
  int element, TCGMemOp size)
 {
-int offs = 0;
+int element_size = 1 << size;
+int offs = element * element_size;
 #ifdef HOST_WORDS_BIGENDIAN
 /* This is complicated slightly because vfp.zregs[n].d[0] is
  * still the low half and vfp.zregs[n].d[1] the high half
  * of the 128 bit vector, even on big endian systems.
- * Calculate the offset assuming a fully bigendian 128 bits,
- * then XOR to account for the order of the two 64 bit halves.
+ * Calculate the offset assuming a fully little-endian 128 bits,
+ * then XOR to account for the order of the 64 bit units.
  */
-offs += (16 - ((element + 1) * (1 << size)));
-offs ^= 8;
-#else
-offs += element * (1 << size);
+if (element_size < 8) {
+offs ^= 8 - element_size;
+}
 #endif
 offs += offsetof(CPUARMState, vfp.zregs[regno]);
 assert_fp_access_checked(s);
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index fb3f54300b..466a209c1e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1550,3 +1550,117 @@ void HELPER(sve_ext)(void *vd, void *vn, void *vm, 
uint32_t desc)
 memcpy(vd + n_siz, &tmp, n_ofs);
 }
 }
+
+#define DO_INSR(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
+{  \
+intptr_t opr_sz = simd_oprsz(desc);\
+swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));\
+*(TYPE *)(vd + H(0)) = val;\
+}
+
+DO_INSR(sve_insr_b, uint8_t, H1)
+DO_INSR(sve_insr_h, uint16_t, H1_2)
+DO_INSR(sve_insr_s, uint32_t, H1_4)
+DO_INSR(sve_insr_d, uint64_t, )
+
+#undef DO_INSR
+
+void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
+{
+intptr_t i, j, opr_sz = simd_oprsz(desc);
+for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
+uint64_t f = *(uint64_t *)(vn + i);
+uint64_t b = *(uint64_t *)(vn + j);
+*(uint64_t *)(vd + i) = bswap64(b);
+*(uint64_t *)(vd + j) = bswap64(f);
+}
+}
+
+static inline uint64_t hswap64(uint64_t h)
+{
+uint64_t m = 0x0000ffff0000ffffull;
+h = rol64(h, 32);
+return ((h & m) << 16) | ((h >> 16) & m);
+}
+
+void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
+{
+intptr_t i, j, opr_sz = simd_oprs
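
The DO_INSR expander above moves the whole vector up by one element and
writes the scalar into element 0.  A standalone toy model of that behaviour
(not part of the patch, illustrative names only):

/* Toy model of INSR for an 8-byte vector of byte elements. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    uint8_t zdn[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8_t val = 0xaa;

    /* Shift every element up by one, then insert the scalar at element 0. */
    memmove(zdn + 1, zdn, sizeof(zdn) - 1);
    zdn[0] = val;

    for (int i = 0; i < 8; i++) {
        printf("%d ", zdn[i]);          /* 170 0 1 2 3 4 5 6 */
    }
    printf("\n");
    return 0;
}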

[Qemu-devel] [PATCH v2 12/67] target/arm: Implement SVE bitwise shift by vector (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 27 +++
 target/arm/sve_helper.c| 25 +
 target/arm/translate-sve.c |  4 
 target/arm/sve.decode  |  8 
 4 files changed, 64 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index b3c89579af..0cc02ee59e 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -168,6 +168,33 @@ DEF_HELPER_FLAGS_5(sve_udiv_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_udiv_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_asr_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_asr_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_asr_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_asr_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_lsr_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsr_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsr_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsr_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_lsl_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsl_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsl_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsl_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_3(sve_orv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_orv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_orv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b1a170fd70..6ea806d12b 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -439,6 +439,28 @@ DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
 
+/* Note that all bits of the shift are significant
+   and not modulo the element size.  */
+#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
+#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
+#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
+
+DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
+
+DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
+
+DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
+
+DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
+DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
+DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
+
 #undef DO_ZPZZ
 #undef DO_ZPZZ_D
 
@@ -543,6 +565,9 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
 #undef DO_ABD
 #undef DO_MUL
 #undef DO_DIV
+#undef DO_ASR
+#undef DO_LSR
+#undef DO_LSL
 
 /* Similar to the ARM LastActiveElement pseudocode function, except the
result is multiplied by the element size.  This includes the not found
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 4218300960..08c56e55a0 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -282,6 +282,10 @@ DO_ZPZZ(MUL, mul)
 DO_ZPZZ(SMULH, smulh)
 DO_ZPZZ(UMULH, umulh)
 
+DO_ZPZZ(ASR, asr)
+DO_ZPZZ(LSR, lsr)
+DO_ZPZZ(LSL, lsl)
+
 void trans_SDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
 {
 static gen_helper_gvec_4 * const fns[4] = {
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index c265ff9899..7ddff8e6bb 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -151,6 +151,14 @@ LSL_zpzi   0100 .. 000 011 100 ... .. ... . \
 ASRD   0100 .. 000 100 100 ... .. ... . \
@rdn_pg_tszimm imm=%tszimm_shr
 
+# SVE bitwise shift by vector (predicated)
+ASR_zpzz   0100 .. 010 000 100 ... . .   @rdn_pg_rm
+LSR_zpzz   0100 .. 010 001 100 ... . .   @rdn_pg_rm
+LSL_zpzz   0100 .. 010 011 100 ... . .   @rdn_pg_rm
+ASR_zpzz   0100 .. 010 100 100 ... . .   @rdm_pg_rn # ASRR
+LSR_zpzz   0100 .. 010 101 100 ... . .   @rdm_pg_rn # LSRR
+LSL_zpzz   0100 .. 010 111 100 ... . .   @rdm_pg_rn # LSLR
+
 ### SVE Logical - Unpredicated Group
 
 # SVE bitwise logical operations (unpredicated)
-- 
2.14.3
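
As an aside on the DO_ASR/DO_LSR/DO_LSL macros in this patch: they clamp the
shift count rather than reducing it modulo the element size, because SVE makes
all bits of the shift operand significant and a plain C shift by the element
width or more would be undefined behaviour.  A small standalone illustration
(not part of the patch):

/* The clamping behaviour of DO_ASR/DO_LSR for byte elements. */
#include <stdint.h>
#include <stdio.h>

static uint8_t do_lsr_b(uint8_t n, uint64_t m)
{
    return m < 8 ? n >> m : 0;              /* over-wide LSR yields 0 */
}

static int8_t do_asr_b(int8_t n, uint64_t m)
{
    return n >> (m < 7 ? m : 7);            /* ASR clamps to width - 1 */
}

int main(void)
{
    printf("%d\n", do_lsr_b(0x80, 9));      /* 0  */
    printf("%d\n", do_asr_b(-128, 9));      /* -1 */
    return 0;
}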




[Qemu-devel] [PATCH v2 18/67] target/arm: Implement SVE Stack Allocation Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 24 
 target/arm/sve.decode  | 12 
 2 files changed, 36 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 773f0bfded..4a38020c8a 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -742,6 +742,30 @@ static void trans_INDEX_rr(DisasContext *s, arg_INDEX_rr 
*a, uint32_t insn)
 do_index(s, a->esz, a->rd, start, incr);
 }
 
+/*
+ *** SVE Stack Allocation Group
+ */
+
+static void trans_ADDVL(DisasContext *s, arg_ADDVL *a, uint32_t insn)
+{
+TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+tcg_gen_addi_i64(rd, rn, a->imm * vec_full_reg_size(s));
+}
+
+static void trans_ADDPL(DisasContext *s, arg_ADDPL *a, uint32_t insn)
+{
+TCGv_i64 rd = cpu_reg_sp(s, a->rd);
+TCGv_i64 rn = cpu_reg_sp(s, a->rn);
+tcg_gen_addi_i64(rd, rn, a->imm * pred_full_reg_size(s));
+}
+
+static void trans_RDVL(DisasContext *s, arg_RDVL *a, uint32_t insn)
+{
+TCGv_i64 reg = cpu_reg(s, a->rd);
+tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s));
+}
+
 /*
  *** SVE Predicate Logical Operations Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index d7b078e92f..0b47869dcd 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -86,6 +86,9 @@
 # One register operand, with governing predicate, vector element size
 @rd_pg_rn   esz:2 ... ... ... pg:3 rn:5 rd:5   &rpr_esz
 
+# Two register operands with a 6-bit signed immediate.
+@rd_rn_i6   ... rn:5 . imm:s6 rd:5 &rri
+
 # Two register operand, one immediate operand, with predicate,
 # element size encoded as TSZHL.  User must fill in imm.
 @rdn_pg_tszimm  .. ... ... ... pg:3 . rd:5 \
@@ -240,6 +243,15 @@ INDEX_ri   0100 esz:2 1 imm:s5 010001 rn:5 rd:5
 # SVE index generation (register start, register increment)
 INDEX_rr   0100 .. 1 . 010011 . .  @rd_rn_rm
 
+### SVE Stack Allocation Group
+
+# SVE stack frame adjustment
+ADDVL  0100 001 . 01010 .. .   @rd_rn_i6
+ADDPL  0100 011 . 01010 .. .   @rd_rn_i6
+
+# SVE stack frame size
+RDVL   0100 101 1 01010 imm:s6 rd:5
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3
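
For reference, ADDVL/ADDPL/RDVL scale their immediate by the vector and
predicate register sizes in bytes, i.e. by what vec_full_reg_size() and
pred_full_reg_size() return.  A worked example (not from the patch), assuming
a 256-bit implementation:

/* The arithmetic behind the ADDVL/ADDPL/RDVL translations, for vq = 2. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
    int vq = 2;                   /* quadwords per Z register      */
    int vl = vq * 16;             /* vector length in bytes:   32  */
    int pl = vq * 2;              /* predicate length in bytes: 4  */
    int64_t sp = 0x10000;

    printf("%#" PRIx64 "\n", (uint64_t)(sp - 2 * vl));   /* ADDVL sp, sp, #-2 -> 0xffc0  */
    printf("%#" PRIx64 "\n", (uint64_t)(sp + 3 * pl));   /* ADDPL x0, sp, #3  -> 0x1000c */
    printf("%d\n", 1 * vl);                              /* RDVL  x1, #1      -> 32      */
    return 0;
}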




[Qemu-devel] [PATCH v2 10/67] target/arm: Implement SVE Integer Reduction Group

2018-02-17 Thread Richard Henderson
Excepting MOVPRFX, which isn't a reduction.  Presumably it is
placed within the group because of its encoding.

Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 44 +
 target/arm/sve_helper.c| 95 +-
 target/arm/translate-sve.c | 65 +++
 target/arm/sve.decode  | 22 +++
 4 files changed, 224 insertions(+), 2 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 5b82ba1501..6b6bbeb272 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -168,6 +168,50 @@ DEF_HELPER_FLAGS_5(sve_udiv_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_udiv_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_orv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_orv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_orv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_orv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_eorv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_eorv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_eorv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_eorv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_andv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_andv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_andv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_andv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_saddv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_saddv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_saddv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_uaddv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uaddv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uaddv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uaddv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_smaxv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_smaxv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_smaxv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_smaxv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_umaxv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_umaxv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_umaxv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_umaxv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_sminv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_sminv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_sminv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_sminv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_uminv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uminv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uminv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_uminv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 26c177c2fd..18fb27805e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -295,6 +295,99 @@ DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
 
+#undef DO_ZPZZ
+#undef DO_ZPZZ_D
+
+/* Two-operand reduction expander, controlled by a predicate.
+ * The difference between TYPERED and TYPERET has to do with
+ * sign-extension.  E.g. for SMAX, TYPERED must be signed,
+ * but TYPERET must be unsigned so that e.g. a 32-bit value
+ * is not sign-extended to the ABI uint64_t return type.
+ */
+/* ??? If we were to vectorize this by hand the reduction ordering
+ * would change.  For integer operands, this is perfectly fine.
+ */
+#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
+uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+TYPERED ret = INIT;\
+for (i = 0; i < opr_sz; ) {\
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));\
+do {   \
+if (pg & 1) {  \
+TYPEELT nn = *(TYPEELT *)(vn + H(i));  \
+ret = OP(ret, nn); \
+}   

[Qemu-devel] [PATCH v2 33/67] target/arm: Implement SVE reverse within elements

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 ++
 target/arm/sve_helper.c| 41 ++---
 target/arm/translate-sve.c | 38 ++
 target/arm/sve.decode  |  7 +++
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index a58fb4ba01..3b7c54905d 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -465,6 +465,20 @@ DEF_HELPER_FLAGS_4(sve_compact_d, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_2(sve_last_active_element, TCG_CALL_NO_RWG, s32, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_revb_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_revb_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_revb_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_revh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_revh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_revw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_rbit_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_rbit_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_rbit_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index ee289be642..a67bb579b8 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -237,6 +237,26 @@ static inline uint64_t expand_pred_s(uint8_t byte)
 return word[byte & 0x11];
 }
 
+/* Swap 16-bit words within a 32-bit word.  */
+static inline uint32_t hswap32(uint32_t h)
+{
+return rol32(h, 16);
+}
+
+/* Swap 16-bit words within a 64-bit word.  */
+static inline uint64_t hswap64(uint64_t h)
+{
+uint64_t m = 0x0000ffff0000ffffull;
+h = rol64(h, 32);
+return ((h & m) << 16) | ((h >> 16) & m);
+}
+
+/* Swap 32-bit words within a 64-bit word.  */
+static inline uint64_t wswap64(uint64_t h)
+{
+return rol64(h, 32);
+}
+
 #define LOGICAL_(NAME, FUNC) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 { \
@@ -615,6 +635,20 @@ DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 
+DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
+DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
+DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
+
+DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
+DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
+
+DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
+
+DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
+DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
+DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
+DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
+
 /* Three-operand expander, unpredicated, in which the third operand is "wide".
  */
 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)   \
@@ -1577,13 +1611,6 @@ void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
 }
 }
 
-static inline uint64_t hswap64(uint64_t h)
-{
-uint64_t m = 0x0000ffff0000ffffull;
-h = rol64(h, 32);
-return ((h & m) << 16) | ((h >> 16) & m);
-}
-
 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
 {
 intptr_t i, j, opr_sz = simd_oprsz(desc);
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index fc2a295ab7..5a1ed379ad 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2435,6 +2435,44 @@ static void trans_CPY_m_v(DisasContext *s, arg_rpr_esz 
*a, uint32_t insn)
 tcg_temp_free_i64(t);
 }
 
+static void trans_REVB(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+NULL,
+gen_helper_sve_revb_h,
+gen_helper_sve_revb_s,
+gen_helper_sve_revb_d,
+};
+do_zpz_ool(s, a, fns[a->esz]);
+}
+
+static void trans_REVH(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+NULL,
+NULL,
+gen_helper_sve_revh_s,
+gen_helper_sve_revh_d,
+};
+do_zpz_ool(s, a, fns[a->esz]);
+}
+
+static void trans_REVW(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ool(s, a, a->esz == 3 ? gen_helper_sve_revw_d : NULL);
+}
+
+static void trans_RBIT(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+gen_helper_sve_rbit_b,
+gen_helper_sve_rbit_h,
+gen_helper_sve_rbit_s,
+gen_helper_sve_rbit_d,
+};
+do_zpz_ool(s, a, fns[a->esz]);
+}
+
 /
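
The hswap/wswap helpers in this patch only reorder 16-bit and 32-bit units
inside a 64-bit value.  A standalone spot check of what REVH and REVW produce
on a doubleword element (not part of the patch):

/* Effect of hswap64 (REVH, .D elements) and wswap64 (REVW, .D elements). */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

static uint64_t rol64(uint64_t x, unsigned n)
{
    return (x << n) | (x >> (64 - n));
}

static uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

int main(void)
{
    uint64_t x = 0x0011223344556677ull;
    printf("%016" PRIx64 "\n", hswap64(x));      /* 6677445522330011 */
    printf("%016" PRIx64 "\n", rol64(x, 32));    /* wswap64: 4455667700112233 */
    return 0;
}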

[Qemu-devel] [PATCH v2 13/67] target/arm: Implement SVE bitwise shift by wide elements (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 21 +
 target/arm/sve_helper.c| 35 +++
 target/arm/translate-sve.c | 25 +
 target/arm/sve.decode  |  6 ++
 4 files changed, 87 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0cc02ee59e..d516580134 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -195,6 +195,27 @@ DEF_HELPER_FLAGS_5(sve_lsl_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_lsl_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_asr_zpzw_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_lsr_zpzw_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsr_zpzw_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsr_zpzw_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_lsl_zpzw_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsl_zpzw_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_lsl_zpzw_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_3(sve_orv_b, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_orv_h, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_orv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 6ea806d12b..3054b3cc99 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -464,6 +464,41 @@ DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 #undef DO_ZPZZ
 #undef DO_ZPZZ_D
 
+/* Three-operand expander, controlled by a predicate, in which the
+ * third operand is "wide".  That is, for D = N op M, the same 64-bit
+ * value of M is used with all of the narrower values of N.
+ */
+#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc);  \
+for (i = 0; i < opr_sz; ) { \
+uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
+TYPEW mm = *(TYPEW *)(vm + i);  \
+do {\
+if (pg & 1) {   \
+TYPE nn = *(TYPE *)(vn + H(i)); \
+*(TYPE *)(vd + H(i)) = OP(nn, mm);  \
+}   \
+i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+} while (i & 7);\
+}   \
+}
+
+DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
+
+DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
+
+DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
+
+#undef DO_ZPZW
+
 /* Two-operand reduction expander, controlled by a predicate.
  * The difference between TYPERED and TYPERET has to do with
  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 08c56e55a0..35bcd9229d 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -473,6 +473,31 @@ static void trans_ASRD(DisasContext *s, arg_rpri_esz *a, 
uint32_t insn)
 }
 }
 
+/*
+ *** SVE Bitwise Shift - Predicated Group
+ */
+
+#define DO_ZPZW(NAME, name) \
+static void trans_##NAME##_zpzw(DisasContext *s, arg_rprr_esz *a, \
+uint32_t insn)\
+{ \
+static gen_helper_gvec_4 * const fns[3] = {   \
+gen_helper_sve_##name##_zpzw_b, gen_helper_sve_##name##_zpzw_h,   \
+gen_helper_sve_##name##_zpzw_s,   \
+};   
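
The _zpzw helpers in this patch take the shift count from the corresponding
64-bit element of Zm, so every narrower lane inside one 64-bit chunk shares a
single count.  A toy model for byte lanes (not from the patch):

/* One 64-bit chunk of LSR .B by a wide .D count: all byte lanes use mm. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t  zn[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
    uint64_t mm    = 2;                    /* shift count for the whole chunk */
    uint8_t  zd[8];

    for (int i = 0; i < 8; i++) {
        zd[i] = mm < 8 ? zn[i] >> mm : 0;  /* DO_LSR semantics */
    }
    for (int i = 0; i < 8; i++) {
        printf("%d ", zd[i]);              /* 0 0 1 2 4 8 16 32 */
    }
    printf("\n");
    return 0;
}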

[Qemu-devel] [PATCH v2 20/67] target/arm: Implement SVE Compute Vector Address Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  5 +
 target/arm/sve_helper.c| 40 
 target/arm/translate-sve.c | 33 +
 target/arm/sve.decode  | 12 
 4 files changed, 90 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 00e3cd48bb..5280d375f9 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -380,6 +380,11 @@ DEF_HELPER_FLAGS_4(sve_lsl_zzw_b, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_lsl_zzw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_lsl_zzw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_adr_p32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_adr_p64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_adr_s32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_adr_u32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4c6e2713fa..a290a58c02 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1061,3 +1061,43 @@ void HELPER(sve_index_d)(void *vd, uint64_t start,
 d[i] = start + i * incr;
 }
 }
+
+void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+uint32_t sh = simd_data(desc);
+uint32_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = n[i] + (m[i] << sh);
+}
+}
+
+void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t sh = simd_data(desc);
+uint64_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = n[i] + (m[i] << sh);
+}
+}
+
+void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t sh = simd_data(desc);
+uint64_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
+}
+}
+
+void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t sh = simd_data(desc);
+uint64_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
+}
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 43e9f1ad08..34cc8c2773 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -847,6 +847,39 @@ static void trans_RDVL(DisasContext *s, arg_RDVL *a, 
uint32_t insn)
 tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s));
 }
 
+/*
+ *** SVE Compute Vector Address Group
+ */
+
+static void do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn)
+{
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   vsz, vsz, a->imm, fn);
+}
+
+static void trans_ADR_p32(DisasContext *s, arg_rrri *a, uint32_t insn)
+{
+do_adr(s, a, gen_helper_sve_adr_p32);
+}
+
+static void trans_ADR_p64(DisasContext *s, arg_rrri *a, uint32_t insn)
+{
+do_adr(s, a, gen_helper_sve_adr_p64);
+}
+
+static void trans_ADR_s32(DisasContext *s, arg_rrri *a, uint32_t insn)
+{
+do_adr(s, a, gen_helper_sve_adr_s32);
+}
+
+static void trans_ADR_u32(DisasContext *s, arg_rrri *a, uint32_t insn)
+{
+do_adr(s, a, gen_helper_sve_adr_u32);
+}
+
 /*
  *** SVE Predicate Logical Operations Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index f71ea1b60d..6ec1f94832 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -49,6 +49,7 @@
 
 &rr_esz   rd rn esz
 &rri   rd rn imm
+&rrri  rd rn rm imm
 &rri_esz   rd rn imm esz
 &rrr_esz   rd rn rm esz
 &rpr_esz   rd pg rn esz
@@ -77,6 +78,9 @@
 # Three operand, vector element size
 @rd_rn_rm   esz:2 . rm:5  ... ...  rn:5 rd:5   &rrr_esz
 
+# Three operand with "memory" size, aka immediate left shift
+@rd_rn_msz_rm   ... rm:5  imm:2 rn:5 rd:5  &rrri
+
 # Two register operand, with governing predicate, vector element size
 @rdn_pg_rm  esz:2 ... ... ... pg:3 rm:5 rd:5 \
&rprr_esz rn=%reg_movprfx
@@ -278,6 +282,14 @@ ASR_zzw0100 .. 1 . 1000 00 . . 
@rd_rn_rm
 LSR_zzw0100 .. 1 . 1000 01 . . 
@rd_rn_rm
 LSL_zzw0100 .. 1 ..
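
The four sve_adr_* helpers in this patch differ only in how the index element
is extended and scaled; per lane the result is base + (extended index << msz).
For instance (not from the patch):

/* One lane of sve_adr_s32: base plus a sign-extended 32-bit index times 8. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
    uint64_t base = 0x1000;
    uint64_t m    = 0x00000000fffffffeull;   /* low 32 bits hold -2 */
    unsigned sh   = 3;                       /* msz = 3: scale by 8 */

    uint64_t d = base + ((uint64_t)(int32_t)m << sh);
    printf("%#" PRIx64 "\n", d);             /* 0xff0 = 0x1000 - 2 * 8 */
    return 0;
}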

[Qemu-devel] [PATCH v2 36/67] target/arm: Implement SVE Integer Compare - Vectors Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 115 +++
 target/arm/sve_helper.c| 193 -
 target/arm/translate-sve.c |  87 
 target/arm/sve.decode  |  24 ++
 4 files changed, 416 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0f57f64895..6ffd1fbe8e 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -490,6 +490,121 @@ DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(sve_splice, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_s, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzz_d, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmple_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmplt_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_b, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_cmpeq_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpne_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpge_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmpgt_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphi_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmphs_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_cmple_ppzw_h, TCG_CALL_NO_RWG,
+   i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLA

[Qemu-devel] [PATCH v2 11/67] target/arm: Implement SVE bitwise shift by immediate (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  25 +
 target/arm/sve_helper.c| 265 +
 target/arm/translate-sve.c | 128 ++
 target/arm/sve.decode  |  29 -
 4 files changed, 445 insertions(+), 2 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 6b6bbeb272..b3c89579af 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -212,6 +212,31 @@ DEF_HELPER_FLAGS_3(sve_uminv_h, TCG_CALL_NO_RWG, i64, ptr, 
ptr, i32)
 DEF_HELPER_FLAGS_3(sve_uminv_s, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_uminv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_clr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_asr_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asr_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asr_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asr_zpzi_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_lsr_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsr_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsr_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsr_zpzi_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_lsl_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsl_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsl_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsl_zpzi_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_asrd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asrd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asrd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asrd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 18fb27805e..b1a170fd70 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -92,6 +92,150 @@ uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t 
words)
 return flags;
 }
 
+/* Expand active predicate bits to bytes, for byte elements.
+ *  for (i = 0; i < 256; ++i) {
+ *  unsigned long m = 0;
+ *  for (j = 0; j < 8; j++) {
+ *  if ((i >> j) & 1) {
+ *  m |= 0xfful << (j << 3);
+ *  }
+ *  }
+ *  printf("0x%016lx,\n", m);
+ *  }
+ */
+static inline uint64_t expand_pred_b(uint8_t byte)
+{
+static const uint64_t word[256] = {
+        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
+        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
+        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
+        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
+        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
+        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
+        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
+        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
+        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
+        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
+        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
+        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
+        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
+        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
+        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
+        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
+        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
+        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
+        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
+        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
+        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
+        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
+        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
+        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
+        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
+        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
+        0x00ff0000ffffff00, 0x00ff0000ffffffff,

[Qemu-devel] [PATCH v2 14/67] target/arm: Implement SVE Integer Arithmetic - Unary Predicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  60 +
 target/arm/sve_helper.c| 127 +
 target/arm/translate-sve.c | 111 +++
 target/arm/sve.decode  |  23 
 4 files changed, 321 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index d516580134..11644125d1 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -285,6 +285,66 @@ DEF_HELPER_FLAGS_4(sve_asrd_h, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_asrd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_asrd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_cls_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cls_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_clz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_clz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_clz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_clz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_cnt_zpz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnt_zpz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnt_zpz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnt_zpz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_cnot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnot_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cnot_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fabs_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fabs_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fabs_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fneg_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fneg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fneg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_not_zpz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_not_zpz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_not_zpz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_not_zpz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_sxtb_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_sxtb_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_sxtb_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_uxtb_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uxtb_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uxtb_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_sxth_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_sxth_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_uxth_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uxth_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_sxtw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uxtw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_abs_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_abs_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_abs_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_abs_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_neg_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_neg_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_neg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_neg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 3054b3cc99..e11823a727 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -499,6 +499,133 @@ DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 
 #undef DO_ZPZW
 
+/* Fully general two-operand expander, controlled by a predicate.
+ */
+#define DO_ZPZ(NAME, TYPE, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
+{  

[Qemu-devel] [PATCH v2 26/67] target/arm: Implement SVE Permute - Extract Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  2 ++
 target/arm/sve_helper.c| 81 ++
 target/arm/translate-sve.c | 29 +
 target/arm/sve.decode  |  9 +-
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 79493ab647..94f4356ce9 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -414,6 +414,8 @@ DEF_HELPER_FLAGS_4(sve_cpy_z_h, TCG_CALL_NO_RWG, void, ptr, 
ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_cpy_z_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_cpy_z_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
+DEF_HELPER_FLAGS_4(sve_ext, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 6a95d1ec48..fb3f54300b 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1469,3 +1469,84 @@ void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t 
val, uint32_t desc)
 d[i] = (pg[H1(i)] & 1 ? val : 0);
 }
 }
+
+/* Big-endian hosts need to frob the byte indices.  If the copy
+ * happens to be 8-byte aligned, then no frobbing necessary.
+ */
+static void swap_memmove(void *vd, void *vs, size_t n)
+{
+uintptr_t d = (uintptr_t)vd;
+uintptr_t s = (uintptr_t)vs;
+uintptr_t o = (d | s | n) & 7;
+size_t i;
+
+#ifndef HOST_WORDS_BIGENDIAN
+o = 0;
+#endif
+switch (o) {
+case 0:
+memmove(vd, vs, n);
+break;
+
+case 4:
+if (d < s || d >= s + n) {
+for (i = 0; i < n; i += 4) {
+*(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
+}
+} else {
+for (i = n; i > 0; ) {
+i -= 4;
+*(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
+}
+}
+break;
+
+case 2:
+case 6:
+if (d < s || d >= s + n) {
+for (i = 0; i < n; i += 2) {
+*(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
+}
+} else {
+for (i = n; i > 0; ) {
+i -= 2;
+*(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
+}
+}
+break;
+
+default:
+if (d < s || d >= s + n) {
+for (i = 0; i < n; i++) {
+*(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
+}
+} else {
+for (i = n; i > 0; ) {
+i -= 1;
+*(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
+}
+}
+break;
+}
+}
+
+void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t opr_sz = simd_oprsz(desc);
+size_t n_ofs = simd_data(desc);
+size_t n_siz = opr_sz - n_ofs;
+
+if (vd != vm) {
+swap_memmove(vd, vn + n_ofs, n_siz);
+swap_memmove(vd + n_siz, vm, n_ofs);
+} else if (vd != vn) {
+swap_memmove(vd + n_siz, vd, n_ofs);
+swap_memmove(vd, vn + n_ofs, n_siz);
+} else {
+/* vd == vn == vm.  Need temp space.  */
+ARMVectorReg tmp;
+swap_memmove(&tmp, vm, n_ofs);
+swap_memmove(vd, vd + n_ofs, n_siz);
+memcpy(vd + n_siz, &tmp, n_ofs);
+}
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index dd085b084b..07a5eac092 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -1790,6 +1790,35 @@ static void trans_CPY_z_i(DisasContext *s, arg_CPY_z_i 
*a, uint32_t insn)
 tcg_temp_free_i64(t_imm);
 }
 
+/*
+ *** SVE Permute Extract Group
+ */
+
+static void trans_EXT(DisasContext *s, arg_EXT *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+unsigned n_ofs = a->imm >= vsz ? 0 : a->imm;
+unsigned n_siz = vsz - n_ofs;
+unsigned d = vec_full_reg_offset(s, a->rd);
+unsigned n = vec_full_reg_offset(s, a->rn);
+unsigned m = vec_full_reg_offset(s, a->rm);
+
+/* Use host vector move insns if we have appropriate sizes
+   and no unfortunate overlap.  */
+if (m != d
+&& n_ofs == size_for_gvec(n_ofs)
+&& n_siz == size_for_gvec(n_siz)
+&& (d != n || n_siz <= n_ofs)) {
+tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz);
+if (n_ofs != 0) {
+tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs);
+}
+return;
+}
+
+tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext);
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index e6e10a4f84..5e3a9839d4 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -22,8 +22,9 @@
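
EXT concatenates the top VL-imm bytes of Zn with the bottom imm bytes of Zm,
which is what the two swap_memmove calls in sve_ext above implement.  A
byte-level sketch (not from the patch):

/* EXT with imm = 3 on toy 8-byte vectors: Zn[3..7] followed by Zm[0..2]. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    uint8_t zn[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8_t zm[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    uint8_t zd[8];
    size_t n_ofs = 3, n_siz = sizeof(zn) - n_ofs;

    memcpy(zd, zn + n_ofs, n_siz);       /* Zn[imm .. VL-1] */
    memcpy(zd + n_siz, zm, n_ofs);       /* Zm[0 .. imm-1]  */

    for (int i = 0; i < 8; i++) {
        printf("%d ", zd[i]);            /* 3 4 5 6 7 10 11 12 */
    }
    printf("\n");
    return 0;
}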
 

[Qemu-devel] [PATCH v2 39/67] target/arm: Implement SVE Predicate Count Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|   2 +
 target/arm/sve_helper.c|  14 ++
 target/arm/translate-sve.c | 116 +
 target/arm/sve.decode  |  27 +++
 4 files changed, 159 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index f0a3ed3414..dd4f8f754d 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -676,3 +676,5 @@ DEF_HELPER_FLAGS_4(sve_brkbs_m, TCG_CALL_NO_RWG, i32, ptr, 
ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index d6d2220f8b..dd884bdd1c 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2702,3 +2702,17 @@ uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, 
uint32_t pred_desc)
 return do_zero(vd, oprsz);
 }
 }
+
+uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
+{
+intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
+intptr_t i;
+
+for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+uint64_t t = n[i] & g[i] & mask;
+sum += ctpop64(t);
+}
+return sum;
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index dc95d68867..038800cc86 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -36,6 +36,8 @@
 typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t);
 typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t,
  int64_t, uint32_t, uint32_t);
+typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t,
+ TCGv_i64, uint32_t, uint32_t);
 typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
 uint32_t, uint32_t, uint32_t);
 
@@ -2731,6 +2733,120 @@ void trans_BRKN(DisasContext *s, arg_rpr_s *a, uint32_t 
insn)
 do_brk2(s, a, gen_helper_sve_brkn, gen_helper_sve_brkns);
 }
 
+/*
+ *** SVE Predicate Count Group
+ */
+
+static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg)
+{
+unsigned psz = pred_full_reg_size(s);
+
+if (psz <= 8) {
+uint64_t psz_mask;
+
+tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn));
+if (pn != pg) {
+TCGv_i64 g = tcg_temp_new_i64();
+tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg));
+tcg_gen_and_i64(val, val, g);
+tcg_temp_free_i64(g);
+}
+
+/* Reduce the pred_esz_masks value simply to reduce the
+   size of the code generated here.  */
+psz_mask = deposit64(0, 0, psz * 8, -1);
+tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask);
+
+tcg_gen_ctpop_i64(val, val);
+} else {
+TCGv_ptr t_pn = tcg_temp_new_ptr();
+TCGv_ptr t_pg = tcg_temp_new_ptr();
+unsigned desc;
+TCGv_i32 t_desc;
+
+desc = psz - 2;
+desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz);
+
+tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn));
+tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+t_desc = tcg_const_i32(desc);
+
+gen_helper_sve_cntp(val, t_pn, t_pg, t_desc);
+tcg_temp_free_ptr(t_pn);
+tcg_temp_free_ptr(t_pg);
+tcg_temp_free_i32(t_desc);
+}
+}
+
+static void trans_CNTP(DisasContext *s, arg_CNTP *a, uint32_t insn)
+{
+do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg);
+}
+
+static void trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a,
+uint32_t insn)
+{
+TCGv_i64 reg = cpu_reg(s, a->rd);
+TCGv_i64 val = tcg_temp_new_i64();
+
+do_cntp(s, val, a->esz, a->pg, a->pg);
+if (a->d) {
+tcg_gen_sub_i64(reg, reg, val);
+} else {
+tcg_gen_add_i64(reg, reg, val);
+}
+tcg_temp_free_i64(val);
+}
+
+static void trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a,
+uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+TCGv_i64 val = tcg_temp_new_i64();
+GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds;
+
+if (a->esz == 0) {
+unallocated_encoding(s);
+return;
+}
+do_cntp(s, val, a->esz, a->pg, a->pg);
+gvec_fn(a->esz, vec_full_reg_offset(s, a->rd),
+vec_full_reg_offset(s, a->rn), val, vsz, vsz);
+}
+
+static void trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a,
+uint32_t insn)
+{
+TCGv_i64 reg = cpu_reg(s, a->rd);
+TCGv_i64 val = tcg_temp_new_i64();
+
+do_cntp(s, val, a->esz, a->pg, a->pg);
+do_sat_addsub_32(reg, val, a->u, a->d);
+}
+
+static void trans_SIN
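
CNTP counts predicate bits that are both active and significant for the
element size, which is all sve_cntp above does with the per-esz mask.  A
minimal illustration (not from the patch; 0x1111... is the .S-element mask,
one significant bit per 32-bit element):

/* Counting active .S elements in a small predicate, sve_cntp style. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t pn   = 0x0000000000001011ull;    /* predicate register bits       */
    uint64_t pg   = ~0ull;                    /* governing predicate: all true */
    uint64_t mask = 0x1111111111111111ull;    /* one bit per 32-bit element    */

    printf("%d\n", __builtin_popcountll(pn & pg & mask));   /* 3 */
    return 0;
}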

[Qemu-devel] [PATCH v2 19/67] target/arm: Implement SVE Bitwise Shift - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 12 +++
 target/arm/sve_helper.c| 30 +
 target/arm/translate-sve.c | 81 ++
 target/arm/sve.decode  | 26 +++
 4 files changed, 149 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 2a2dbe98dd..00e3cd48bb 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -368,6 +368,18 @@ DEF_HELPER_FLAGS_4(sve_index_h, TCG_CALL_NO_RWG, void, 
ptr, i32, i32, i32)
 DEF_HELPER_FLAGS_4(sve_index_s, TCG_CALL_NO_RWG, void, ptr, i32, i32, i32)
 DEF_HELPER_FLAGS_4(sve_index_d, TCG_CALL_NO_RWG, void, ptr, i64, i64, i32)
 
+DEF_HELPER_FLAGS_4(sve_asr_zzw_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asr_zzw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_asr_zzw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_lsr_zzw_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsr_zzw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsr_zzw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_lsl_zzw_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsl_zzw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_lsl_zzw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 950012e70a..4c6e2713fa 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -614,6 +614,36 @@ DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 
+/* Three-operand expander, unpredicated, in which the third operand is "wide".
+ */
+#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+for (i = 0; i < opr_sz; ) {\
+TYPEW mm = *(TYPEW *)(vm + i); \
+do {   \
+TYPE nn = *(TYPE *)(vn + H(i));\
+*(TYPE *)(vd + H(i)) = OP(nn, mm); \
+i += sizeof(TYPE); \
+} while (i & 7);   \
+}  \
+}
+
+DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
+DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
+DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
+
+DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
+DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
+DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
+
+DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
+DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
+DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
+
+#undef DO_ZZW
+
 #undef DO_CLS_B
 #undef DO_CLS_H
 #undef DO_CLZ_B
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 4a38020c8a..43e9f1ad08 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -130,6 +130,13 @@ static void do_mov_z(DisasContext *s, int rd, int rn)
 do_vector2_z(s, tcg_gen_gvec_mov, 0, rd, rn);
 }
 
+/* Initialize a Zreg with replications of a 64-bit immediate.  */
+static void do_dupi_z(DisasContext *s, int rd, uint64_t word)
+{
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), vsz, vsz, word);
+}
+
 /* Invoke a vector expander on two Pregs.  */
 static void do_vector2_p(DisasContext *s, GVecGen2Fn *gvec_fn,
  int esz, int rd, int rn)
@@ -644,6 +651,80 @@ DO_ZPZW(LSL, lsl)
 
 #undef DO_ZPZW
 
+/*
+ *** SVE Bitwise Shift - Unpredicated Group
+ */
+
+static void do_shift_imm(DisasContext *s, arg_rri_esz *a, bool asr,
+ void (*gvec_fn)(unsigned, uint32_t, uint32_t,
+ int64_t, uint32_t, uint32_t))
+{
+unsigned vsz = vec_full_reg_size(s);
+if (a->esz < 0) {
+/* Invalid tsz encoding -- see tszimm_esz. */
+unallocated_encoding(s);
+return;
+}
+/* Shift by element size is architecturally valid.  For
+   arithmetic right-shift, it's the same as by one less.
+   Otherwise it is a zeroing operation.  */
+if (a->imm >= 8 << a->esz) {
+if (asr) {
+a->imm = (8 << a->esz) - 1;
+} else {
+do_dupi_z(s, a->rd, 0);
+return;
+  

[Qemu-devel] [PATCH v2 17/67] target/arm: Implement SVE Index Generation Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  5 
 target/arm/sve_helper.c| 40 +++
 target/arm/translate-sve.c | 67 ++
 target/arm/sve.decode  | 14 ++
 4 files changed, 126 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index b31d497f31..2a2dbe98dd 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -363,6 +363,11 @@ DEF_HELPER_FLAGS_6(sve_mls_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_mls_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_index_b, TCG_CALL_NO_RWG, void, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_4(sve_index_h, TCG_CALL_NO_RWG, void, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_4(sve_index_s, TCG_CALL_NO_RWG, void, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_4(sve_index_d, TCG_CALL_NO_RWG, void, ptr, i64, i64, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4b08a38ce8..950012e70a 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -991,3 +991,43 @@ DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
 #undef DO_MLS
 #undef DO_ZPZZZ
 #undef DO_ZPZZZ_D
+
+void HELPER(sve_index_b)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc);
+uint8_t *d = vd;
+for (i = 0; i < opr_sz; i += 1) {
+d[H1(i)] = start + i * incr;
+}
+}
+
+void HELPER(sve_index_h)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+uint16_t *d = vd;
+for (i = 0; i < opr_sz; i += 1) {
+d[H2(i)] = start + i * incr;
+}
+}
+
+void HELPER(sve_index_s)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+uint32_t *d = vd;
+for (i = 0; i < opr_sz; i += 1) {
+d[H4(i)] = start + i * incr;
+}
+}
+
+void HELPER(sve_index_d)(void *vd, uint64_t start,
+ uint64_t incr, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = start + i * incr;
+}
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 8baec6c674..773f0bfded 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -675,6 +675,73 @@ DO_ZPZZZ(MLS, mls)
 
 #undef DO_ZPZZZ
 
+/*
+ *** SVE Index Generation Group
+ */
+
+static void do_index(DisasContext *s, int esz, int rd,
+ TCGv_i64 start, TCGv_i64 incr)
+{
+unsigned vsz = vec_full_reg_size(s);
+TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, 0));
+TCGv_ptr t_zd = tcg_temp_new_ptr();
+
+tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd));
+if (esz == 3) {
+gen_helper_sve_index_d(t_zd, start, incr, desc);
+} else {
+typedef void index_fn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
+static index_fn * const fns[3] = {
+gen_helper_sve_index_b,
+gen_helper_sve_index_h,
+gen_helper_sve_index_s,
+};
+TCGv_i32 s32 = tcg_temp_new_i32();
+TCGv_i32 i32 = tcg_temp_new_i32();
+
+tcg_gen_extrl_i64_i32(s32, start);
+tcg_gen_extrl_i64_i32(i32, incr);
+fns[esz](t_zd, s32, i32, desc);
+
+tcg_temp_free_i32(s32);
+tcg_temp_free_i32(i32);
+}
+tcg_temp_free_ptr(t_zd);
+tcg_temp_free_i32(desc);
+}
+
+static void trans_INDEX_ii(DisasContext *s, arg_INDEX_ii *a, uint32_t insn)
+{
+TCGv_i64 start = tcg_const_i64(a->imm1);
+TCGv_i64 incr = tcg_const_i64(a->imm2);
+do_index(s, a->esz, a->rd, start, incr);
+tcg_temp_free_i64(start);
+tcg_temp_free_i64(incr);
+}
+
+static void trans_INDEX_ir(DisasContext *s, arg_INDEX_ir *a, uint32_t insn)
+{
+TCGv_i64 start = tcg_const_i64(a->imm);
+TCGv_i64 incr = cpu_reg(s, a->rm);
+do_index(s, a->esz, a->rd, start, incr);
+tcg_temp_free_i64(start);
+}
+
+static void trans_INDEX_ri(DisasContext *s, arg_INDEX_ri *a, uint32_t insn)
+{
+TCGv_i64 start = cpu_reg(s, a->rn);
+TCGv_i64 incr = tcg_const_i64(a->imm);
+do_index(s, a->esz, a->rd, start, incr);
+tcg_temp_free_i64(incr);
+}
+
+static void trans_INDEX_rr(DisasContext *s, arg_INDEX_rr *a, uint32_t insn)
+{
+TCGv_i64 start = cpu_reg(s, a->rn);
+TCGv_i64 incr = cpu_reg(s, a->rm);
+do_index(s, a->esz, a->rd, start, incr);
+}
+
 /*
  *** SVE Predicate Logical Operations Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index b40d7dc9a2..d7b078e92f 100644
--- a/target/arm/sve.decode
+++ b/target/
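
The sve_index_* helpers in this patch fill element i with start + i * incr.
For example (not from the patch), INDEX z0.b, #1, #2 over the first eight
byte elements:

/* Mirror of sve_index_b for an 8-byte slice. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t d[8];
    uint8_t start = 1, incr = 2;

    for (int i = 0; i < 8; i++) {
        d[i] = start + i * incr;
    }
    for (int i = 0; i < 8; i++) {
        printf("%d ", d[i]);          /* 1 3 5 7 9 11 13 15 */
    }
    printf("\n");
    return 0;
}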

[Qemu-devel] [PATCH v2 43/67] target/arm: Implement SVE Floating Point Arithmetic - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 +++
 target/arm/helper.h| 19 ++
 target/arm/translate-sve.c | 41 
 target/arm/vec_helper.c| 94 ++
 target/arm/Makefile.objs   |  2 +-
 target/arm/sve.decode  | 10 +
 6 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 target/arm/vec_helper.c

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 97bfe0f47b..2e76084992 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -705,3 +705,17 @@ DEF_HELPER_FLAGS_4(sve_umini_b, TCG_CALL_NO_RWG, void, 
ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_umini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_umini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_umini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_5(gvec_recps_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_recps_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_recps_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_rsqrts_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index be3c2fcdc0..f3ce58e276 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -565,6 +565,25 @@ DEF_HELPER_2(dc_zva, void, env, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
+DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fsub_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 72abcb543a..f9a3ad1434 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3109,6 +3109,47 @@ DO_ZZI(UMIN, umin)
 
 #undef DO_ZZI
 
+/*
+ *** SVE Floating Point Arithmetic - Unpredicated Group
+ */
+
+static void do_zzz_fp(DisasContext *s, arg_rrr_esz *a,
+  gen_helper_gvec_3_ptr *fn)
+{
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr status;
+
+if (fn == NULL) {
+unallocated_encoding(s);
+return;
+}
+status = get_fpstatus_ptr(a->esz == MO_16);
+tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   status, vsz, vsz, 0, fn);
+}
+
+
+#define DO_FP3(NAME, name) \
+static void trans_##NAME(DisasContext *s, arg_rrr_esz *a, uint32_t insn) \
+{   \
+static gen_helper_gvec_3_ptr * const fns[4] = { \
+NULL, gen_helper_gvec_##name##_h,   \
+gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d  \
+};  \
+do_zzz_fp(s, a, fns[a->esz]);   \
+}
+
+DO_FP3(FADD_zzz, fadd)
+DO_FP3(FSUB_zzz, fsub)
+DO_FP3(FMUL_zzz, fmul)
+DO_FP3(FTSMUL, ftsmul)
+DO_FP3(FRECPS, recps)
+DO_FP3(FRSQRTS, rsqrts)
+
+#undef DO_FP3
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
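
For clarity when reviewing, this is roughly what DO_FP3(FADD_zzz, fadd) above
expands to after preprocessing; it only restates the macro pattern and adds
nothing new:

    static void trans_FADD_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
    {
        static gen_helper_gvec_3_ptr * const fns[4] = {
            NULL, gen_helper_gvec_fadd_h,
            gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_d
        };
        do_zzz_fp(s, a, fns[a->esz]);
    }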
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
new file mode 100644
index 00..ad5c29cdd5
--- /dev/null
+++ b/target/arm/vec_helper.c
@@ -0,0 +1,94 @@
+/*
+ * ARM Shared AdvSIMD / SVE Operations
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as pu

[Qemu-devel] [PATCH v2 21/67] target/arm: Implement SVE floating-point exponential accelerator

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  4 +++
 target/arm/sve_helper.c| 81 ++
 target/arm/translate-sve.c | 22 +
 target/arm/sve.decode  |  7 
 4 files changed, 114 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 5280d375f9..e2925ff8ec 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -385,6 +385,10 @@ DEF_HELPER_FLAGS_4(sve_adr_p64, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_adr_s32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_adr_u32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_fexpa_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fexpa_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fexpa_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a290a58c02..4d42653eef 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1101,3 +1101,84 @@ void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, 
uint32_t desc)
 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
 }
 }
+
+void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
+{
+static const uint16_t coeff[] = {
+0x, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
+0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
+0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
+0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
+};
+intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+uint16_t *d = vd, *n = vn;
+
+for (i = 0; i < opr_sz; i++) {
+uint16_t nn = n[i];
+intptr_t idx = extract32(nn, 0, 5);
+uint16_t exp = extract32(nn, 5, 5);
+d[i] = coeff[idx] | (exp << 10);
+}
+}
+
+void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
+{
+static const uint32_t coeff[] = {
+0x00, 0x0164d2, 0x02cd87, 0x043a29,
+0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
+0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
+0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
+0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
+0x1ef532, 0x20b051, 0x227043, 0x243516,
+0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
+0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
+0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
+0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
+0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
+0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
+0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
+0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
+0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
+0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
+};
+intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+uint32_t *d = vd, *n = vn;
+
+for (i = 0; i < opr_sz; i++) {
+uint32_t nn = n[i];
+intptr_t idx = extract32(nn, 0, 6);
+uint32_t exp = extract32(nn, 6, 8);
+d[i] = coeff[idx] | (exp << 23);
+}
+}
+
+void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
+{
+static const uint64_t coeff[] = {
+0x0, 0x02C9A3E778061, 0x059B0D3158574, 0x0874518759BC8,
+0x0B5586CF9890F, 0x0E3EC32D3D1A2, 0x11301D0125B51, 0x1429AAEA92DE0,
+0x172B83C7D517B, 0x1A35BEB6FCB75, 0x1D4873168B9AA, 0x2063B88628CD6,
+0x2387A6E756238, 0x26B4565E27CDD, 0x29E9DF51FDEE1, 0x2D285A6E4030B,
+0x306FE0A31B715, 0x33C08B26416FF, 0x371A7373AA9CB, 0x3A7DB34E59FF7,
+0x3DEA64C123422, 0x4160A21F72E2A, 0x44E086061892D, 0x486A2B5C13CD0,
+0x4BFDAD5362A27, 0x4F9B2769D2CA7, 0x5342B569D4F82, 0x56F4736B527DA,
+0x5AB07DD485429, 0x5E76F15AD2148, 0x6247EB03A5585, 0x6623882552225,
+0x6A09E667F3BCD, 0x6DFB23C651A2F, 0x71F75E8EC5F74, 0x75FEB564267C9,
+0x7A11473EB0187, 0x7E2F336CF4E62, 0x82589994CCE13, 0x868D99B4492ED,
+0x8ACE5422AA0DB, 0x8F1AE99157736, 0x93737B0CDC5E5, 0x97D829FDE4E50,
+0x9C49182A3F090, 0xA0C667B5DE565, 0xA5503B23E255D, 0xA9E6B5579FDBF,
+0xAE89F995AD3AD, 0xB33A2B84F15FB, 0xB7F76F2FB5E47, 0xBCC1E904BC1D2,
+0xC199BDD85529C, 0xC67F12E57D14B, 0xCB720DCEF9069, 0xD072D4A07897C,
+0xD5818DCFBA487, 0xDA9E603DB3285, 0xDFC97337B9B5F, 0xE502EE78B3FF6,
+0xEA4AFA2A490DA, 0xEFA1BEE615A27, 0xF50765B6E4540, 0xFA7C1819E90D8,
+};
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+
+for (i = 0; i < opr_sz; i++) {
+uint64_t nn = n[i];
+intptr_t idx = extract32(nn, 0, 6);
+uint64_t exp = extract32(nn, 6, 11);
+d[i] = coeff[idx] | (exp << 52);
+}
+}
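
A small worked example of the half-precision FEXPA computation above may help
when reviewing the table: for nn = 0x0123, the low 5 bits give coefficient
index 3 and bits [9:5] give exponent 9, so the result is
coeff[3] | (9 << 10) = 0x0045 | 0x2400 = 0x2445. A standalone check follows;
only the first four table entries are reproduced, which is enough for this
input, and the value of nn is made up for the demo.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        static const uint16_t coeff4[4] = { 0x0000, 0x0016, 0x002d, 0x0045 };
        uint16_t nn = 0x0123;
        uint16_t idx = nn & 0x1f;          /* bits [4:0] -> table index     */
        uint16_t exp = (nn >> 5) & 0x1f;   /* bits [9:5] -> exponent field  */

        printf("0x%04x\n", (uint16_t)(coeff4[idx] | (exp << 10)));  /* 0x2445 */
        return 0;
    }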
diff --git a/

[Qemu-devel] [PATCH v2 23/67] target/arm: Implement SVE Element Count Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  11 ++
 target/arm/sve_helper.c| 136 ++
 target/arm/translate-sve.c | 274 -
 target/arm/sve.decode  |  30 -
 4 files changed, 448 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 4f1bd5a62f..2831e1643b 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -393,6 +393,17 @@ DEF_HELPER_FLAGS_4(sve_ftssel_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_ftssel_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_ftssel_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_sqaddi_b, TCG_CALL_NO_RWG, void, ptr, ptr, s32, i32)
+DEF_HELPER_FLAGS_4(sve_sqaddi_h, TCG_CALL_NO_RWG, void, ptr, ptr, s32, i32)
+DEF_HELPER_FLAGS_4(sve_sqaddi_s, TCG_CALL_NO_RWG, void, ptr, ptr, s64, i32)
+DEF_HELPER_FLAGS_4(sve_sqaddi_d, TCG_CALL_NO_RWG, void, ptr, ptr, s64, i32)
+
+DEF_HELPER_FLAGS_4(sve_uqaddi_b, TCG_CALL_NO_RWG, void, ptr, ptr, s32, i32)
+DEF_HELPER_FLAGS_4(sve_uqaddi_h, TCG_CALL_NO_RWG, void, ptr, ptr, s32, i32)
+DEF_HELPER_FLAGS_4(sve_uqaddi_s, TCG_CALL_NO_RWG, void, ptr, ptr, s64, i32)
+DEF_HELPER_FLAGS_4(sve_uqaddi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_uqsubi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b4f70af23f..cfda16d520 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1225,3 +1225,139 @@ void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, 
uint32_t desc)
 d[i] = nn ^ (mm & 2) << 62;
 }
 }
+
+/*
+ * Signed saturating addition with scalar operand.
+ */
+
+void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+int r = *(int8_t *)(a + i) + b;
+if (r > INT8_MAX) {
+r = INT8_MAX;
+} else if (r < INT8_MIN) {
+r = INT8_MIN;
+}
+*(int8_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+int r = *(int16_t *)(a + i) + b;
+if (r > INT16_MAX) {
+r = INT16_MAX;
+} else if (r < INT16_MIN) {
+r = INT16_MIN;
+}
+*(int16_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+int64_t r = *(int32_t *)(a + i) + b;
+if (r > INT32_MAX) {
+r = INT32_MAX;
+} else if (r < INT32_MIN) {
+r = INT32_MIN;
+}
+*(int32_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+int64_t ai = *(int64_t *)(a + i);
+int64_t r = ai + b;
+if (((r ^ ai) & ~(ai ^ b)) < 0) {
+/* Signed overflow.  */
+r = (r < 0 ? INT64_MAX : INT64_MIN);
+}
+*(int64_t *)(d + i) = r;
+}
+}
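
The 64-bit case cannot widen, so it uses the classic sign-bit test: overflow
happened iff the operands have the same sign while the sum's sign differs,
which is exactly when (r ^ ai) & ~(ai ^ b) has its sign bit set. A standalone
illustration of just that test (the addition is done in unsigned arithmetic to
keep it well defined; not part of the patch):

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool add_overflows(int64_t a, int64_t b)
    {
        int64_t r = (int64_t)((uint64_t)a + (uint64_t)b);
        return ((r ^ a) & ~(a ^ b)) < 0;
    }

    int main(void)
    {
        printf("%d\n", add_overflows(INT64_MAX, 1));  /* 1: must saturate  */
        printf("%d\n", add_overflows(-5, 3));         /* 0: no overflow    */
        return 0;
    }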
+
+/*
+ * Unsigned saturating addition with scalar operand.
+ */
+
+void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+int r = *(uint8_t *)(a + i) + b;
+if (r > UINT8_MAX) {
+r = UINT8_MAX;
+} else if (r < 0) {
+r = 0;
+}
+*(uint8_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+int r = *(uint16_t *)(a + i) + b;
+if (r > UINT16_MAX) {
+r = UINT16_MAX;
+} else if (r < 0) {
+r = 0;
+}
+*(uint16_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+int64_t r = *(uint32_t *)(a + i) + b;
+if (r > UINT32_MAX) {
+r = UINT32_MAX;
+} else if (r < 0) {
+r = 0;
+}
+*(uint32_t *)(d + i) = r;
+}
+}
+
+void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+intptr_t i, oprsz = simd_oprsz(desc);
+
+for (i = 0; i < oprsz; i +=

[Qemu-devel] [PATCH v2 28/67] target/arm: Implement SVE Permute - Predicates Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|   6 +
 target/arm/sve_helper.c| 280 +
 target/arm/translate-sve.c | 110 ++
 target/arm/sve.decode  |  18 +++
 4 files changed, 414 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0c9aad575e..ff958fcebd 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -439,6 +439,12 @@ DEF_HELPER_FLAGS_3(sve_uunpk_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_uunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_uunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_zip_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uzp_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_trn_p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_rev_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_punpk_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 466a209c1e..c3a2706a16 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1664,3 +1664,283 @@ DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
 
 #undef DO_UNPK
+
+static const uint64_t expand_bit_data[5][2] = {
+{ 0xull, 0xull },
+{ 0x0303030303030303ull, 0x0c0c0c0c0c0c0c0cull },
+{ 0x000f000f000f000full, 0x00f000f000f000f0ull },
+{ 0x00ff00ffull, 0xff00ff00ull },
+{ 0xull, 0xull }
+};
+
+/* Expand units of 2**N bits to units of 2**(N+1) bits,
+   with the higher bits zero.  */
+static uint64_t expand_bits(uint64_t x, int n)
+{
+int i, sh;
+for (i = 4, sh = 16; i >= n; i--, sh >>= 1) {
+x = ((x & expand_bit_data[i][1]) << sh) | (x & expand_bit_data[i][0]);
+}
+return x;
+}
+
+/* Compress units of 2**(N+1) bits to units of 2**N bits.  */
+static uint64_t compress_bits(uint64_t x, int n)
+{
+int i, sh;
+for (i = n, sh = 1 << n; i <= 4; i++, sh <<= 1) {
+x = ((x >> sh) & expand_bit_data[i][1]) | (x & expand_bit_data[i][0]);
+}
+return x;
+}
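
Because the masked implementation above is terse, here is a bit-at-a-time
reference version of the same mapping, useful for checking its behaviour.
expand_bits_ref is a local name for this demo, and only the low half of the
input is meaningful, just as for the helper it mirrors:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t expand_bits_ref(uint64_t x, int n)
    {
        uint64_t r = 0;
        int unit = 1 << n;                     /* input unit width in bits */

        for (int i = 0; i * 2 * unit < 64; i++) {
            uint64_t u = (x >> (i * unit)) & ((1ull << unit) - 1);
            r |= u << (i * 2 * unit);          /* each unit doubles in width */
        }
        return r;
    }

    int main(void)
    {
        /* 0b1011 -> 0b1000101: each bit now sits in a 2-bit unit. */
        printf("0x%llx\n", (unsigned long long)expand_bits_ref(0xb, 0)); /* 0x45 */
        return 0;
    }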
+
+void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
+{
+intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+uint64_t *d = vd;
+intptr_t i;
+
+if (oprsz <= 8) {
+uint64_t nn = *(uint64_t *)vn;
+uint64_t mm = *(uint64_t *)vm;
+int half = 4 * oprsz;
+
+nn = extract64(nn, high * half, half);
+mm = extract64(mm, high * half, half);
+nn = expand_bits(nn, esz);
+mm = expand_bits(mm, esz);
+d[0] = nn + (mm << (1 << esz));
+} else {
+ARMPredicateReg tmp_n, tmp_m;
+
+/* We produce output faster than we consume input.
+   Therefore we must be mindful of possible overlap.  */
+if ((vn - vd) < (uintptr_t)oprsz) {
+vn = memcpy(&tmp_n, vn, oprsz);
+}
+if ((vm - vd) < (uintptr_t)oprsz) {
+vm = memcpy(&tmp_m, vm, oprsz);
+}
+if (high) {
+high = oprsz >> 1;
+}
+
+if ((high & 3) == 0) {
+uint32_t *n = vn, *m = vm;
+high >>= 2;
+
+for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
+uint64_t nn = n[H4(high + i)];
+uint64_t mm = m[H4(high + i)];
+
+nn = expand_bits(nn, esz);
+mm = expand_bits(mm, esz);
+d[i] = nn + (mm << (1 << esz));
+}
+} else {
+uint8_t *n = vn, *m = vm;
+uint16_t *d16 = vd;
+
+for (i = 0; i < oprsz / 2; i++) {
+uint16_t nn = n[H1(high + i)];
+uint16_t mm = m[H1(high + i)];
+
+nn = expand_bits(nn, esz);
+mm = expand_bits(mm, esz);
+d16[H2(i)] = nn + (mm << (1 << esz));
+}
+}
+}
+}
+
+void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
+{
+intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
+uint64_t *d = vd, *n = vn, *m = vm;
+uint64_t l, h;
+intptr_t i;
+
+if (oprsz <= 8) {
+l = compress_bits(n[0] >> odd, esz);
+h = compress_bits(m[0] >> odd, esz);
+d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
+} 

[Qemu-devel] [PATCH v2 47/67] target/arm: Implement SVE integer convert to floating-point

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 30 +++
 target/arm/sve_helper.c| 52 ++
 target/arm/translate-sve.c | 92 ++
 target/arm/sve.decode  | 22 +++
 4 files changed, 196 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 74c2d642a3..fb7609f9ef 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -720,6 +720,36 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_dh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_ss, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_sd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_ds, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_dd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_ucvt_hh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_sh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_dh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_ss, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_sd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_ds, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_dd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e259e910de..a1e0ceb5fb 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2789,6 +2789,58 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, 
uint32_t pred_desc)
 return predtest_ones(d, oprsz, esz_mask);
 }
 
+/* Fully general two-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
+#define DO_ZPZ_FP(NAME, TYPE, H, OP)\
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc);  \
+for (i = 0; i < opr_sz; ) { \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+do {\
+if (pg & 1) {   \
+TYPE nn = *(TYPE *)(vn + H(i)); \
+*(TYPE *)(vd + H(i)) = OP(nn, status);  \
+}   \
+i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+} while (i & 15);   \
+}   \
+}
+
+/* Similarly, specialized for 64-bit operands.  */
+#define DO_ZPZ_FP_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{   \
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;  \
+TYPE *d = vd, *n = vn;  \
+uint8_t *pg = vg;   \
+for (i = 0; i < opr_sz; i += 1) {   \
+if (pg[H1(i)] & 1) {\
+d[i] = OP(n[i], status);\
+}   \
+}   \
+}
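
The per-element predicate handling in DO_ZPZ_FP follows the usual SVE helper
idiom: the governing predicate supplies one bit per byte, bit 0 of the current
16-bit chunk covers the current element, and shifting right by sizeof(TYPE)
moves the next element's bit into position. A tiny standalone demo for 32-bit
elements, with an invented predicate value:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* One predicate bit per byte; for 4-byte elements only bits
           0, 4, 8 and 12 of this chunk matter.  0x0011 -> elements 0, 1. */
        uint16_t pg = 0x0011;

        for (int elt = 0; elt < 4; elt++) {
            printf("element %d: %s\n", elt, (pg & 1) ? "active" : "inactive");
            pg >>= 4;   /* sizeof(uint32_t) predicate bits per element */
        }
        return 0;
    }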
+
+DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
+DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
+DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
+DO_ZPZ_FP_D(sve_scvt_sd, uint64_t, int32_to_float64)
+DO_ZPZ_FP_D(sve_scvt_dh, uint64_t, int64_to_float16)
+DO_ZPZ_FP_D(sve_scvt_ds, uint64_t, int64_to_float32)
+DO_ZPZ_FP_D(sve_scvt_dd, uint64_t, int64_to_float64)
+
+DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
+DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
+DO_ZPZ_FP(sve_ucv

[Qemu-devel] [PATCH v2 25/67] target/arm: Implement SVE Integer Wide Immediate - Predicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  10 +
 target/arm/sve_helper.c| 108 +
 target/arm/translate-sve.c |  92 ++
 target/arm/sve.decode  |  17 +++
 4 files changed, 227 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 2831e1643b..79493ab647 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -404,6 +404,16 @@ DEF_HELPER_FLAGS_4(sve_uqaddi_s, TCG_CALL_NO_RWG, void, 
ptr, ptr, s64, i32)
 DEF_HELPER_FLAGS_4(sve_uqaddi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(sve_uqsubi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
+DEF_HELPER_FLAGS_5(sve_cpy_m_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_5(sve_cpy_m_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_5(sve_cpy_m_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_5(sve_cpy_m_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(sve_cpy_z_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_cpy_z_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_cpy_z_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_cpy_z_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index cfda16d520..6a95d1ec48 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1361,3 +1361,111 @@ void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, 
uint32_t desc)
 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
 }
 }
+
+/* Two operand predicated copy immediate with merge.  All valid immediates
+ * can fit within 17 signed bits in the simd_data field.
+ */
+void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+mm = (mm & 0xff) * (-1ull / 0xff);
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i];
+uint64_t pp = expand_pred_b(pg[H1(i)]);
+d[i] = (mm & pp) | (nn & ~pp);
+}
+}
+
+void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+mm = (mm & 0x) * (-1ull / 0x);
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i];
+uint64_t pp = expand_pred_h(pg[H1(i)]);
+d[i] = (mm & pp) | (nn & ~pp);
+}
+}
+
+void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+mm = deposit64(mm, 32, 32, mm);
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i];
+uint64_t pp = expand_pred_s(pg[H1(i)]);
+d[i] = (mm & pp) | (nn & ~pp);
+}
+}
+
+void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i];
+d[i] = (pg[H1(i)] & 1 ? mm : nn);
+}
+}
+
+void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+
+val = (val & 0xff) * (-1ull / 0xff);
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = val & expand_pred_b(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+
+val = (val & 0x) * (-1ull / 0x);
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = val & expand_pred_h(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+
+val = deposit64(val, 32, 32, val);
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = val & expand_pred_s(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+d[i] = (pg[H1(i)] & 1 ? val : 0);
+}
+}
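
One detail worth calling out for review: expressions like
(mm & 0xff) * (-1ull / 0xff) splat the immediate across a 64-bit word, since
-1ull / 0xff is 0x0101010101010101. A standalone check of that identity:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t mm = 0xab;
        uint64_t splat = (mm & 0xff) * (-1ull / 0xff);

        printf("%016llx\n", (unsigned long long)splat);  /* abababababababab */
        return 0;
    }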
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 21b1e4df85..dd085b084b 

[Qemu-devel] [PATCH v2 22/67] target/arm: Implement SVE floating-point trig select coefficient

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  4 
 target/arm/sve_helper.c| 43 +++
 target/arm/translate-sve.c | 19 +++
 target/arm/sve.decode  |  4 
 4 files changed, 70 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index e2925ff8ec..4f1bd5a62f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -389,6 +389,10 @@ DEF_HELPER_FLAGS_3(sve_fexpa_h, TCG_CALL_NO_RWG, void, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_fexpa_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_fexpa_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_ftssel_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ftssel_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ftssel_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4d42653eef..b4f70af23f 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -23,6 +23,7 @@
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
+#include "fpu/softfloat.h"
 
 
 /* Note that vector data is stored in host-endian 64-bit chunks,
@@ -1182,3 +1183,45 @@ void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t 
desc)
 d[i] = coeff[idx] | (exp << 52);
 }
 }
+
+void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+uint16_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+uint16_t nn = n[i];
+uint16_t mm = m[i];
+if (mm & 1) {
+nn = float16_one;
+}
+d[i] = nn ^ (mm & 2) << 14;
+}
+}
+
+void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+uint32_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+uint32_t nn = n[i];
+uint32_t mm = m[i];
+if (mm & 1) {
+nn = float32_one;
+}
+d[i] = nn ^ (mm & 2) << 30;
+}
+}
+
+void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn, *m = vm;
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i];
+uint64_t mm = m[i];
+if (mm & 1) {
+nn = float64_one;
+}
+d[i] = nn ^ (mm & 2) << 62;
+}
+}
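
A standalone restatement of the single-precision rule coded above, useful as a
sanity check: bit 0 of the second operand replaces the element with 1.0, bit 1
flips the sign bit. The inputs in main() are invented for the demo.

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t ftssel32(uint32_t nn, uint32_t mm)
    {
        if (mm & 1) {
            nn = 0x3f800000;                   /* float32 1.0 */
        }
        return nn ^ ((uint32_t)(mm & 2) << 30);
    }

    int main(void)
    {
        printf("%08x\n", ftssel32(0x40000000, 0));  /* 40000000:  2.0 */
        printf("%08x\n", ftssel32(0x40000000, 1));  /* 3f800000:  1.0 */
        printf("%08x\n", ftssel32(0x40000000, 2));  /* c0000000: -2.0 */
        printf("%08x\n", ftssel32(0x40000000, 3));  /* bf800000: -1.0 */
        return 0;
    }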
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 2f23f1b192..e32be385fd 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -902,6 +902,25 @@ static void trans_FEXPA(DisasContext *s, arg_rr_esz *a, 
uint32_t insn)
vsz, vsz, 0, fns[a->esz]);
 }
 
+static void trans_FTSSEL(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+NULL,
+gen_helper_sve_ftssel_h,
+gen_helper_sve_ftssel_s,
+gen_helper_sve_ftssel_d,
+};
+unsigned vsz = vec_full_reg_size(s);
+if (a->esz == 0) {
+unallocated_encoding(s);
+return;
+}
+tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   vsz, vsz, 0, fns[a->esz]);
+}
+
 /*
  *** SVE Predicate Logical Operations Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index e791fe8031..4ea3f33919 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -297,6 +297,10 @@ ADR_p640100 11 1 . 1010 .. . . 
@rd_rn_msz_rm
 # Note esz != 0
 FEXPA  0100 .. 1 0 101110 . .  @rd_rn
 
+# SVE floating-point trig select coefficient
+# Note esz != 0
+FTSSEL 0100 .. 1 . 101100 . .  @rd_rn_rm
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3




[Qemu-devel] [PATCH v2 41/67] target/arm: Implement FDUP/DUP

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 35 +++
 target/arm/sve.decode  |  8 
 2 files changed, 43 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 4b92a55c21..7571d02237 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2939,6 +2939,41 @@ static void trans_WHILE(DisasContext *s, arg_WHILE *a, 
uint32_t insn)
 tcg_temp_free_i32(t3);
 }
 
+/*
+ *** SVE Integer Wide Immediate - Unpredicated Group
+ */
+
+static void trans_FDUP(DisasContext *s, arg_FDUP *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+int dofs = vec_full_reg_offset(s, a->rd);
+uint64_t imm;
+
+if (a->esz == 0) {
+unallocated_encoding(s);
+return;
+}
+
+/* Decode the VFP immediate.  */
+imm = vfp_expand_imm(a->esz, a->imm);
+imm = dup_const(a->esz, imm);
+
+tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);
+}
+
+static void trans_DUP_i(DisasContext *s, arg_DUP_i *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+int dofs = vec_full_reg_offset(s, a->rd);
+
+if (a->esz == 0 && extract32(insn, 13, 1)) {
+unallocated_encoding(s);
+return;
+}
+
+tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm));
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
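
For reviewers who have not met dup_const() before, a sketch of what it is
assumed to produce here: the low 2^esz bytes of the immediate replicated
across a 64-bit word, which tcg_gen_gvec_dup64i then broadcasts to the whole
vector. dup_const_ref below is a local stand-in for illustration only, not
the QEMU definition:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t dup_const_ref(int esz, uint64_t imm)
    {
        switch (esz) {
        case 0: return (imm & 0xff) * 0x0101010101010101ull;
        case 1: return (imm & 0xffff) * 0x0001000100010001ull;
        case 2: return (imm & 0xffffffffull) * 0x0000000100000001ull;
        default: return imm;
        }
    }

    int main(void)
    {
        printf("%016llx\n", (unsigned long long)dup_const_ref(1, 0x1234));
        /* prints 1234123412341234 */
        return 0;
    }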
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index b5bc7e9546..ea1bfe7579 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -622,6 +622,14 @@ CTERM  00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 

 # SVE integer compare scalar count and limit
 WHILE  00100101 esz:2 1 rm:5 000 sf:1 u:1 1 rn:5 eq:1 rd:4
 
+### SVE Integer Wide Immediate - Unpredicated Group
+
+# SVE broadcast floating-point immediate (unpredicated)
+FDUP   00100101 esz:2 111 00 1110 imm:8 rd:5
+
+# SVE broadcast integer immediate (unpredicated)
+DUP_i  00100101 esz:2 111 00 011 .  rd:5   imm=%sh8_i8s
+
 ### SVE Memory - 32-bit Gather and Unsized Contiguous Group
 
 # SVE load predicate register
-- 
2.14.3




[Qemu-devel] [PATCH v2 48/67] target/arm: Implement SVE floating-point arithmetic (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  77 
 target/arm/sve_helper.c| 107 +
 target/arm/translate-sve.c |  47 
 target/arm/sve.decode  |  17 +++
 4 files changed, 248 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index fb7609f9ef..84d0a8978c 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -720,6 +720,83 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadd_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsub_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsub_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsub_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmul_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmul_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmul_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fdiv_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fdiv_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fdiv_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmin_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmin_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmin_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmax_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmax_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fminnum_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnum_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxnum_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnum_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnum_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fabd_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fabd_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fabd_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fscalbn_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fscalbn_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fscalbn_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmulx_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmulx_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmulx_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a1e0ceb5fb..d80babfae7 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2789,6 +2789,113 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, 
uint32_t pred_desc)
 return predtest_ones(d, oprsz, esz_mask);
 }
 
+/* Fully general three-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
+#define DO_ZPZZ_FP(NAME, TYPE, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,   \
+  void *status, uint32_t desc)  \
+{   

[Qemu-devel] [PATCH v2 24/67] target/arm: Implement SVE Bitwise Immediate Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 50 ++
 target/arm/sve.decode  | 17 
 2 files changed, 67 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 702f20e97b..21b1e4df85 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -34,6 +34,8 @@
 #include "translate-a64.h"
 
 typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t,
+ int64_t, uint32_t, uint32_t);
 typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
 uint32_t, uint32_t, uint32_t);
 
@@ -1648,6 +1650,54 @@ static void trans_SINCDEC_v(DisasContext *s, 
arg_incdec2_cnt *a,
 }
 }
 
+/*
+ *** SVE Bitwise Immediate Group
+ */
+
+static void do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn)
+{
+unsigned vsz;
+uint64_t imm;
+
+if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1),
+extract32(a->dbm, 0, 6),
+extract32(a->dbm, 6, 6))) {
+unallocated_encoding(s);
+return;
+}
+
+vsz = vec_full_reg_size(s);
+gvec_fn(MO_64, vec_full_reg_offset(s, a->rd),
+vec_full_reg_offset(s, a->rn), imm, vsz, vsz);
+}
+
+static void trans_AND_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn)
+{
+do_zz_dbm(s, a, tcg_gen_gvec_andi);
+}
+
+static void trans_ORR_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn)
+{
+do_zz_dbm(s, a, tcg_gen_gvec_ori);
+}
+
+static void trans_EOR_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn)
+{
+do_zz_dbm(s, a, tcg_gen_gvec_xori);
+}
+
+static void trans_DUPM(DisasContext *s, arg_DUPM *a, uint32_t insn)
+{
+uint64_t imm;
+if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1),
+extract32(a->dbm, 0, 6),
+extract32(a->dbm, 6, 6))) {
+unallocated_encoding(s);
+return;
+}
+do_dupi_z(s, a->rd, imm);
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 5690b5fcb9..0990d135f4 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -50,6 +50,7 @@
 
 &rr_eszrd rn esz
 &rri   rd rn imm
+&rr_dbmrd rn dbm
 &rrri  rd rn rm imm
 &rri_esz   rd rn imm esz
 &rrr_esz   rd rn rm esz
@@ -112,6 +113,10 @@
 @rd_rn_tszimm   .. ... ... .. rn:5 rd:5 \
&rri_esz esz=%tszimm16_esz
 
+# Two register operand, one encoded bitmask.
+@rdn_dbm    ..  dbm:13 rd:5 \
+   &rr_dbm rn=%reg_movprfx
+
 # Basic Load/Store with 9-bit immediate offset
 @pd_rn_i9    .. rn:5 . rd:4\
&rri imm=%imm9_16_10
@@ -331,6 +336,18 @@ INCDEC_v   0100 .. 1 1  1100 0 d:1 . .
@incdec2_cnt u=1
 # Note these require esz != 0.
 SINCDEC_v  0100 .. 1 0  1100 d:1 u:1 . .   @incdec2_cnt
 
+### SVE Bitwise Immediate Group
+
+# SVE bitwise logical with immediate (unpredicated)
+ORR_zzi0101 00  . .@rdn_dbm
+EOR_zzi0101 01  . .@rdn_dbm
+AND_zzi0101 10  . .@rdn_dbm
+
+# SVE broadcast bitmask immediate
+DUPM   0101 11  dbm:13 rd:5
+
+### SVE Predicate Logical Operations Group
+
 # SVE predicate logical operations
 AND_   00100101 0. 00  01  0  0    @pd_pg_pn_pm_s
 BIC_   00100101 0. 00  01  0  1    @pd_pg_pn_pm_s
-- 
2.14.3




[Qemu-devel] [PATCH v2 32/67] target/arm: Implement SVE copy to vector (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 13 +
 target/arm/sve.decode  |  6 ++
 2 files changed, 19 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 207a22a0bc..fc2a295ab7 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2422,6 +2422,19 @@ static void trans_LASTB_r(DisasContext *s, arg_rpr_esz 
*a, uint32_t insn)
 do_last_general(s, a, true);
 }
 
+static void trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn));
+}
+
+static void trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+int ofs = vec_reg_offset(s, a->rn, 0, a->esz);
+TCGv_i64 t = load_esz(cpu_env, ofs, a->esz);
+do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t);
+tcg_temp_free_i64(t);
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 1370802c12..5e127de88c 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -451,6 +451,12 @@ LASTB_v0101 .. 10001 1 100 ... . . 
@rd_pg_rn
 LASTA_r0101 .. 1 0 101 ... . . 
@rd_pg_rn
 LASTB_r0101 .. 1 1 101 ... . . 
@rd_pg_rn
 
+# SVE copy element from SIMD&FP scalar register
+CPY_m_v0101 .. 10 100 ... . .  
@rd_pg_rn
+
+# SVE copy element from general register to vector (predicated)
+CPY_m_r0101 .. 101000 101 ... . .  
@rd_pg_rn
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3




[Qemu-devel] [PATCH v2 44/67] target/arm: Implement SVE Memory Contiguous Load Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  35 +++
 target/arm/sve_helper.c| 235 +
 target/arm/translate-sve.c | 130 +
 target/arm/sve.decode  |  44 -
 4 files changed, 442 insertions(+), 2 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 2e76084992..fcc9ba5f50 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -719,3 +719,38 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4f45f11bff..e542725113 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2788,3 +2788,238 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, 
uint32_t pred_desc)
 
 return predtest_ones(d, oprsz, esz_mask);
 }
+
+/*
+ * Load contiguous data, protected by a governing predicate.
+ */
+#define DO_LD1(NAME, FN, TYPEE, TYPEM, H)  \
+void HELPER(NAME)(CPUARMState *env, void *vg,  \
+  target_ulong addr, uint32_t desc)\
+{  \
+intptr_t i, oprsz = simd_oprsz(desc);  \
+intptr_t ra = GETPC(); \
+unsigned rd = simd_data(desc); \
+void *vd = &env->vfp.zregs[rd];\
+for (i = 0; i < oprsz; ) { \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));\
+do {   \
+TYPEM m = 0;   \
+if (pg & 1) {  \
+m = FN(env, addr, ra); \
+}  \
+*(TYPEE *)(vd + H(i)) = m; \
+i += sizeof(TYPEE), pg >>= sizeof(TYPEE);  \
+addr += sizeof(TYPEM); \
+} while (i & 15);  \
+}  \
+}
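
Setting the MMU and fault handling aside, the behaviour the DO_LD1 expansion
aims for can be modelled in a few lines of plain C: active elements load the
next memory element, inactive elements are written as zero, and the address
advances for every element regardless of the predicate. Host arrays stand in
for guest memory here; this is illustration only, not the patch:

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    static void ld1_ref(uint32_t *zd, const uint32_t *mem,
                        const bool *pg, int elements)
    {
        for (int i = 0; i < elements; i++) {
            zd[i] = pg[i] ? mem[i] : 0;   /* inactive elements are zeroed */
        }
    }

    int main(void)
    {
        uint32_t mem[4] = { 10, 20, 30, 40 };
        bool pg[4] = { true, false, true, true };
        uint32_t zd[4];

        ld1_ref(zd, mem, pg, 4);
        printf("%u %u %u %u\n", zd[0], zd[1], zd[2], zd[3]);  /* 10 0 30 40 */
        return 0;
    }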
+
+#define DO_LD1_D(NAME, FN, TYPEM)  \
+void HELPER(NAME)(CPUARMState *env, void *vg,  \
+  target_ulong addr, uint32_t desc)\
+{  \
+intptr_t i, oprsz = simd_oprsz(desc) / 8;  \
+intptr_t ra = GETPC(); \
+unsigned rd = simd_data(desc); \
+uint64_t *d = &env->vfp.zregs[rd].d[0];\
+uint8_t *pg = vg;  \
+for (i = 

[Qemu-devel] [PATCH v2 53/67] target/arm: Implement SVE scatter stores

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 41 ++
 target/arm/sve_helper.c| 62 
 target/arm/translate-sve.c | 71 ++
 target/arm/sve.decode  | 39 +
 4 files changed, 213 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 6c640a92ff..b5c093f2fd 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -918,3 +918,44 @@ DEF_HELPER_FLAGS_4(sve_st1hs_r, TCG_CALL_NO_WG, void, env, 
ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_st1hd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 
 DEF_HELPER_FLAGS_4(sve_st1sd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sths_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stss_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbs_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sths_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stss_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a7dc6f6164..07b3d285f2 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3545,3 +3545,65 @@ void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
 addr += 4 * 8;
 }
 }
+
+/* Stores with a vector index.  */
+
+#define DO_ST1_ZPZ_S(NAME, TYPEI, FN)   \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,   \
+  target_ulong base, uint32_t desc) \
+{   \
+intptr_t i, oprsz = simd_oprsz(desc) / 8;   \
+unsigned scale = simd_data(desc);   \
+uintptr_t ra = GETPC(); \
+uint32_t *d = vd; TYPEI *m = vm; uint8_t *pg = vg;  \
+for (i = 0; i < oprsz; i++) {   \
+uint8_t pp = pg[H1(i)]; \
+if (pp & 0x01) {\
+target_ulong off = (target_ulong)m[H4(i * 2)] << scale; \
+FN(env, base + off, d[H4(i * 2)], ra);  \
+}   \
+if (pp & 0x10) {\
+target_ulong off = (target_ulong)m[H4(i * 2 + 1)] << scale; \
+FN(env, base + off, d[H4(i * 2 + 1)], ra);  \
+}   \
+}   \
+}
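
The scatter pattern above is easier to review against a scalar model: each
active element d[i] is stored at base + (index[i] << scale). The sketch below
uses a host byte array in place of guest memory and memcpy to sidestep
alignment concerns; all the numbers are invented:

    #include <stdint.h>
    #include <stdbool.h>
    #include <string.h>
    #include <stdio.h>

    static void st1_scatter_ref(uint8_t *mem, const uint32_t *d,
                                const uint32_t *idx, const bool *pg,
                                int elements, int scale)
    {
        for (int i = 0; i < elements; i++) {
            if (pg[i]) {
                size_t off = (size_t)idx[i] << scale;
                memcpy(mem + off, &d[i], sizeof(d[i]));  /* scattered store */
            }
        }
    }

    int main(void)
    {
        uint8_t mem[64] = { 0 };
        uint32_t d[4]   = { 1, 2, 3, 4 };
        uint32_t idx[4] = { 0, 3, 1, 7 };
        bool pg[4]      = { true, true, false, true };
        uint32_t out;

        st1_scatter_ref(mem, d, idx, pg, 4, 2);  /* scale 2: word indices */
        memcpy(&out, mem + 12, sizeof(out));
        printf("%u\n", out);                     /* element 1 landed at byte 12 */
        return 0;
    }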
+
+#define DO_ST1_ZPZ_D(NAME, TYPEI, FN)   \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,   \
+  target_ulong base, uint32_t desc) \
+{   \
+intptr_t i, oprsz = simd_oprsz(desc) / 8;   \
+unsigned scale = simd_data(desc);   \
+uintptr_t ra = GETPC(); \
+uint64_t *d = vd, *m = vm; uint8_t *pg = vg;\
+for (i 

[Qemu-devel] [PATCH v2 34/67] target/arm: Implement SVE vector splice (predicated)

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  2 ++
 target/arm/sve_helper.c| 37 +
 target/arm/translate-sve.c | 10 ++
 target/arm/sve.decode  |  3 +++
 4 files changed, 52 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 3b7c54905d..c3f8a2b502 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -479,6 +479,8 @@ DEF_HELPER_FLAGS_4(sve_rbit_h, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_rbit_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_rbit_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_splice, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a67bb579b8..f524a1ddce 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2088,3 +2088,40 @@ int32_t HELPER(sve_last_active_element)(void *vg, 
uint32_t pred_desc)
 
 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
 }
+
+void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
+{
+intptr_t opr_sz = simd_oprsz(desc) / 8;
+int esz = simd_data(desc);
+uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
+intptr_t i, first_i, last_i;
+ARMVectorReg tmp;
+
+first_i = last_i = 0;
+first_g = last_g = 0;
+
+/* Find the extent of the active elements within VG.  */
+for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
+pg = *(uint64_t *)(vg + i) & mask;
+if (pg) {
+if (last_g == 0) {
+last_g = pg;
+last_i = i;
+}
+first_g = pg;
+first_i = i;
+}
+}
+
+len = 0;
+if (first_g != 0) {
+first_i = first_i * 8 + ctz64(first_g);
+last_i = last_i * 8 + 63 - clz64(last_g);
+len = last_i - first_i + (1 << esz);
+if (vd == vm) {
+vm = memcpy(&tmp, vm, opr_sz * 8);
+}
+swap_memmove(vd, vn + first_i, len);
+}
+swap_memmove(vd + len, vm, opr_sz * 8 - len);
+}
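
A scalar restatement of the splice semantics implemented above, handy as a
cross-check: copy the first operand from its first active element through its
last active element to the start of the result, then fill the remainder from
the start of the second operand. Reference only, with invented values:

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    static void splice_ref(uint32_t *d, const uint32_t *n, const uint32_t *m,
                           const bool *pg, int elements)
    {
        int first = -1, last = -1, len = 0;

        /* Find the extent of the active elements.  */
        for (int i = 0; i < elements; i++) {
            if (pg[i]) {
                if (first < 0) {
                    first = i;
                }
                last = i;
            }
        }
        if (first >= 0) {
            len = last - first + 1;
            for (int i = 0; i < len; i++) {
                d[i] = n[first + i];
            }
        }
        for (int i = len; i < elements; i++) {
            d[i] = m[i - len];
        }
    }

    int main(void)
    {
        uint32_t n[4] = { 1, 2, 3, 4 }, m[4] = { 9, 8, 7, 6 }, d[4];
        bool pg[4] = { false, true, true, false };

        splice_ref(d, n, m, pg, 4);
        printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);  /* 2 3 9 8 */
        return 0;
    }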
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 5a1ed379ad..559fb41fd6 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2473,6 +2473,16 @@ static void trans_RBIT(DisasContext *s, arg_rpr_esz *a, 
uint32_t insn)
 do_zpz_ool(s, a, fns[a->esz]);
 }
 
+static void trans_SPLICE(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   pred_full_reg_offset(s, a->pg),
+   vsz, vsz, a->esz, gen_helper_sve_splice);
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 8903fb6592..70feb448e6 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -464,6 +464,9 @@ REVH0101 .. 1001 01 100 ... . . 
@rd_pg_rn
 REVW   0101 .. 1001 10 100 ... . . @rd_pg_rn
 RBIT   0101 .. 1001 11 100 ... . . @rd_pg_rn
 
+# SVE vector splice (predicated)
+SPLICE 0101 .. 101 100 100 ... . . @rdn_pg_rm
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3




[Qemu-devel] [PATCH v2 29/67] target/arm: Implement SVE Permute - Interleaving Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 15 ++
 target/arm/sve_helper.c| 72 ++
 target/arm/translate-sve.c | 69 
 target/arm/sve.decode  | 10 +++
 4 files changed, 166 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ff958fcebd..bab20345c6 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -445,6 +445,21 @@ DEF_HELPER_FLAGS_4(sve_trn_p, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_rev_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_punpk_p, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_zip_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_zip_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_zip_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_zip_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_uzp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uzp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uzp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_uzp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_trn_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_trn_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_bic_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
 DEF_HELPER_FLAGS_5(sve_eor_, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, 
i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index c3a2706a16..62982bd099 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1944,3 +1944,75 @@ void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t 
pred_desc)
 }
 }
 }
+
+#define DO_ZIP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
+{\
+intptr_t oprsz = simd_oprsz(desc);   \
+intptr_t i, oprsz_2 = oprsz / 2; \
+ARMVectorReg tmp_n, tmp_m;   \
+/* We produce output faster than we consume input.   \
+   Therefore we must be mindful of possible overlap.  */ \
+if (unlikely((vn - vd) < (uintptr_t)oprsz)) {\
+vn = memcpy(&tmp_n, vn, oprsz_2);\
+}\
+if (unlikely((vm - vd) < (uintptr_t)oprsz)) {\
+vm = memcpy(&tmp_m, vm, oprsz_2);\
+}\
+for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {\
+*(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
+*(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
+}\
+}
+
+DO_ZIP(sve_zip_b, uint8_t, H1)
+DO_ZIP(sve_zip_h, uint16_t, H1_2)
+DO_ZIP(sve_zip_s, uint32_t, H1_4)
+DO_ZIP(sve_zip_d, uint64_t, )
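
DO_ZIP interleaves the low halves of the two inputs, element by element; the
ZIP2 form differs only in taking the high halves. A fixed-size standalone demo
for byte elements, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t n[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        uint8_t m[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        uint8_t d[8];

        for (int i = 0; i < 4; i++) {      /* consume the low half of each */
            d[2 * i]     = n[i];
            d[2 * i + 1] = m[i];
        }
        for (int i = 0; i < 8; i++) {
            printf("%u ", d[i]);           /* 0 10 1 11 2 12 3 13 */
        }
        printf("\n");
        return 0;
    }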
+
+#define DO_UZP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t oprsz = simd_oprsz(desc); \
+intptr_t oprsz_2 = oprsz / 2;  \
+intptr_t odd_ofs = simd_data(desc);\
+intptr_t i;\
+ARMVectorReg tmp_m;\
+if (unlikely((vm - vd) < (uintptr_t)oprsz)) {  \
+vm = memcpy(&tmp_m, vm, oprsz);\
+}  \
+for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {  \
+*(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
+}  \
+for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {  \
+*(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
+}  \
+}
+
+DO_UZP(sve_uzp_b, uint8_t, H1)
+DO_UZP(sve_uzp_h, uint16_t, H1_2)
+DO_UZP(sve_uzp_s, uint32_t, H1_4)
+DO_UZP(sve_uzp_d, uint64_t, )
+
+#define DO_TRN(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+

[Qemu-devel] [PATCH v2 35/67] target/arm: Implement SVE Select Vectors Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  9 
 target/arm/sve_helper.c| 55 ++
 target/arm/translate-sve.c |  2 ++
 target/arm/sve.decode  |  6 +
 4 files changed, 72 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index c3f8a2b502..0f57f64895 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -195,6 +195,15 @@ DEF_HELPER_FLAGS_5(sve_lsl_zpzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_lsl_zpzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_sel_zpzz_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sel_zpzz_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sel_zpzz_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_sel_zpzz_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_asr_zpzw_h, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index f524a1ddce..86cd792cdf 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2125,3 +2125,58 @@ void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
 }
 swap_memmove(vd + len, vm, opr_sz * 8 - len);
 }
+
+void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
+void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn, *m = vm;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i], mm = m[i];
+uint64_t pp = expand_pred_b(pg[H1(i)]);
+d[i] = (nn & pp) | (mm & ~pp);
+}
+}
+
+void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
+void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn, *m = vm;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i], mm = m[i];
+uint64_t pp = expand_pred_h(pg[H1(i)]);
+d[i] = (nn & pp) | (mm & ~pp);
+}
+}
+
+void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
+void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn, *m = vm;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i], mm = m[i];
+uint64_t pp = expand_pred_s(pg[H1(i)]);
+d[i] = (nn & pp) | (mm & ~pp);
+}
+}
+
+void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
+void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn, *m = vm;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i += 1) {
+uint64_t nn = n[i], mm = m[i];
+d[i] = (pg[H1(i)] & 1 ? nn : mm);
+}
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 559fb41fd6..021b33ced9 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -361,6 +361,8 @@ static void trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
 do_zpzz_ool(s, a, fns[a->esz]);
 }
 
+DO_ZPZZ(SEL, sel)
+
 #undef DO_ZPZZ
 
 /*
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 70feb448e6..7ec84fdd80 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -99,6 +99,7 @@
&rprr_esz rn=%reg_movprfx
 @rdm_pg_rn  esz:2 ... ... ... pg:3 rn:5 rd:5 \
&rprr_esz rm=%reg_movprfx
+@rd_pg4_rn_rm   esz:2 . rm:5  .. pg:4  rn:5 rd:5   &rprr_esz
 
 # Three register operand, with governing predicate, vector element size
 @rda_pg_rn_rm   esz:2 . rm:5  ... pg:3 rn:5 rd:5 \
@@ -467,6 +468,11 @@ RBIT   0101 .. 1001 11 100 ... . . @rd_pg_rn
 # SVE vector splice (predicated)
 SPLICE 0101 .. 101 100 100 ... . . @rdn_pg_rm
 
+### SVE Select Vectors Group
+
+# SVE select vector elements (predicated)
+SEL_zpzz   0101 .. 1 . 11  . . @rd_pg4_rn_rm
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3
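
A note on the helpers above: for the sub-64-bit element sizes the predicate byte is widened to a per-element mask (expand_pred_b/h/s) so the select becomes plain bitwise arithmetic. A rough stand-alone model of the byte case (the expansion loop here is only a slow reference; the in-tree helpers are implemented differently):

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Widen one predicate byte (one bit per byte element) to a 64-bit mask
 * with 0xff for active elements, then select n where active, m elsewhere.
 */
static uint64_t expand_pred_b(uint8_t pg)
{
    uint64_t mask = 0;
    for (int i = 0; i < 8; i++) {
        if (pg & (1u << i)) {
            mask |= 0xffull << (i * 8);
        }
    }
    return mask;
}

int main(void)
{
    uint64_t n = 0x1111111111111111ull;
    uint64_t m = 0x2222222222222222ull;
    uint8_t pg = 0x0f;                       /* low four byte elements active */
    uint64_t pp = expand_pred_b(pg);
    uint64_t d = (n & pp) | (m & ~pp);

    printf("%016" PRIx64 "\n", d);           /* 2222222211111111 */
    return 0;
}
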




[Qemu-devel] [PATCH v2 58/67] target/arm: Implement SVE floating-point arithmetic with immediate

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 56 +++
 target/arm/sve_helper.c| 68 ++
 target/arm/translate-sve.c | 73 ++
 target/arm/sve.decode  | 14 +
 4 files changed, 211 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 30373e3fc7..7ada12687b 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -809,6 +809,62 @@ DEF_HELPER_FLAGS_6(sve_fmulx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_fmulx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_6(sve_fadds_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadds_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadds_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsubs_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubs_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubs_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmuls_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmuls_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmuls_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsubrs_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubrs_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubrs_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxnms_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnms_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnms_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fminnms_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnms_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnms_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxs_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxs_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxs_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmins_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmins_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmins_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, i64, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index ace613684d..9378c8f0b2 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2995,6 +2995,74 @@ DO_ZPZZ_FP_D(sve_fmulx_d, uint64_t, helper_vfp_mulxd)
 #undef DO_ZPZZ_FP
 #undef DO_ZPZZ_FP_D
 
+/* Three-operand expander, with one scalar operand, controlled by
+ * a predicate, with the extra float_status parameter.
+ */
+#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
+  void *status, uint32_t desc)\
+{ \
+intptr_t i, opr_sz = simd_oprsz(desc);\
+TYPE mm = scalar; \
+for (i = 0; i < opr_sz; ) {   \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));   \
+do {  \
+if (pg & 1) { \
+TYPE nn = *(TYPE *)(vn + H(i));   \
+*(TYPE *)(vd + H(i)) = OP(nn, mm, status);\
+} \
+i += sizeof(TYPE), pg >>= sizeof(TYPE);   \
+} while (i & 15); \
+} \
+}
+
+DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
+DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)

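The DO_ZPZS_FP expansion above uses the usual SVE predicate walk: the predicate is read 16 vector bytes at a time and shifted right by the element size so that bit 0 always governs the current element, and inactive elements are left untouched. A stand-alone sketch of that walk for 32-bit elements (hypothetical names; assumes a little-endian host and an operation size that is a multiple of 16 bytes, as for real SVE vectors):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Predicated "add scalar" over 32-bit elements: one predicate bit per
 * vector byte, read 16 bits (= 16 vector bytes) at a time.
 */
static void adds_s(float *d, const float *n, const uint8_t *vg,
                   float scalar, size_t oprsz)
{
    for (size_t i = 0; i < oprsz; ) {
        uint16_t pg;
        memcpy(&pg, vg + (i >> 3), sizeof(pg));   /* little-endian host assumed */
        do {
            if (pg & 1) {
                d[i / 4] = n[i / 4] + scalar;     /* active element: compute */
            }                                     /* inactive: leave d alone */
            i += 4;
            pg >>= 4;
        } while (i & 15);
    }
}

int main(void)
{
    float n[4] = {1, 2, 3, 4}, d[4] = {0, 0, 0, 0};
    uint8_t vg[2] = {0x11, 0x00};      /* bits 0 and 4: elements 0 and 1 active */

    adds_s(d, n, vg, 10.0f, sizeof(n));
    printf("%g %g %g %g\n", d[0], d[1], d[2], d[3]);   /* 11 12 0 0 */
    return 0;
}
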
[Qemu-devel] [PATCH v2 30/67] target/arm: Implement SVE compress active elements

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  3 +++
 target/arm/sve_helper.c| 34 ++
 target/arm/translate-sve.c | 12 
 target/arm/sve.decode  |  6 ++
 4 files changed, 55 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index bab20345c6..d977aea00d 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -460,6 +460,9 @@ DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_trn_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_compact_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_compact_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 62982bd099..87a1a32232 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2016,3 +2016,37 @@ DO_TRN(sve_trn_d, uint64_t, )
 #undef DO_ZIP
 #undef DO_UZP
 #undef DO_TRN
+
+void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
+uint32_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+for (i = j = 0; i < opr_sz; i++) {
+if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
+d[H4(j)] = n[H4(i)];
+j++;
+}
+}
+for (; j < opr_sz; j++) {
+d[H4(j)] = 0;
+}
+}
+
+void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd, *n = vn;
+uint8_t *pg = vg;
+
+for (i = j = 0; i < opr_sz; i++) {
+if (pg[H1(i)] & 1) {
+d[j] = n[i];
+j++;
+}
+}
+for (; j < opr_sz; j++) {
+d[j] = 0;
+}
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 09ac955a36..21531b259c 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2111,6 +2111,18 @@ static void trans_TRN2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn)
 do_zzz_data_ool(s, a, 1 << a->esz, trn_fns[a->esz]);
 }
 
+/*
+ *** SVE Permute Vector - Predicated Group
+ */
+
+static void trans_COMPACT(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3 * const fns[4] = {
+NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d
+};
+do_zpz_ool(s, a, fns[a->esz]);
+}
+
 /*
  *** SVE Memory - 32-bit Gather and Unsized Contiguous Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 2efa3773fc..a89bd37eeb 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -425,6 +425,12 @@ UZP2_z 0101 .. 1 . 011 011 . . @rd_rn_rm
 TRN1_z 0101 .. 1 . 011 100 . . @rd_rn_rm
 TRN2_z 0101 .. 1 . 011 101 . . @rd_rn_rm
 
+### SVE Permute - Predicated Group
+
+# SVE compress active elements
+# Note esz >= 2
+COMPACT 0101 .. 11 100 ... . . @rd_pg_rn
+
 ### SVE Predicate Logical Operations Group
 
 # SVE predicate logical operations
-- 
2.14.3
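
For reference, the effect of sve_compact_s above is simply to pack the active elements to the low end of the register and zero the rest. A stand-alone model (hypothetical names, host code only):

#include <stdint.h>
#include <stdio.h>

/* Copy the 32-bit elements whose predicate bit is set to the low end of
 * the destination and zero the remainder.  Each predicate byte covers 8
 * bytes of vector data, so element 2*k uses bit 0 of byte k and element
 * 2*k+1 uses bit 4, matching sve_compact_s above.
 */
static void compact_s(uint32_t *d, const uint32_t *n,
                      const uint8_t *pg, size_t nelem)
{
    size_t i, j = 0;

    for (i = 0; i < nelem; i++) {
        if (pg[i / 2] & (i & 1 ? 0x10 : 0x01)) {
            d[j++] = n[i];
        }
    }
    for (; j < nelem; j++) {
        d[j] = 0;
    }
}

int main(void)
{
    uint32_t n[4] = {100, 200, 300, 400}, d[4];
    uint8_t pg[2] = {0x10, 0x01};            /* elements 1 and 2 active */

    compact_s(d, n, pg, 4);
    printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);   /* 200 300 0 0 */
    return 0;
}
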




[Qemu-devel] [PATCH v2 52/67] target/arm: Implement SVE store vector/predicate register

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 101 +
 target/arm/sve.decode  |   6 +++
 2 files changed, 107 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index b000a2482e..9c724980a0 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3501,6 +3501,95 @@ static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len,
 tcg_temp_free_i64(t0);
 }
 
+/* Similarly for stores.  */
+static void do_str(DisasContext *s, uint32_t vofs, uint32_t len,
+   int rn, int imm)
+{
+uint32_t len_align = QEMU_ALIGN_DOWN(len, 8);
+uint32_t len_remain = len % 8;
+uint32_t nparts = len / 8 + ctpop8(len_remain);
+int midx = get_mem_index(s);
+TCGv_i64 addr, t0;
+
+addr = tcg_temp_new_i64();
+t0 = tcg_temp_new_i64();
+
+/* Note that unpredicated load/store of vector/predicate registers
+ * are defined as a stream of bytes, which equates to little-endian
+ * operations on larger quantities.  There is no nice way to force
+ * a little-endian load for aarch64_be-linux-user out of line.
+ *
+ * Attempt to keep code expansion to a minimum by limiting the
+ * amount of unrolling done.
+ */
+if (nparts <= 4) {
+int i;
+
+for (i = 0; i < len_align; i += 8) {
+tcg_gen_ld_i64(t0, cpu_env, vofs + i);
+tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i);
+tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
+}
+} else {
+TCGLabel *loop = gen_new_label();
+TCGv_ptr i = TCGV_NAT_TO_PTR(glue(tcg_const_local_, ptr)(0));
+TCGv_ptr src;
+
+gen_set_label(loop);
+
+src = tcg_temp_new_ptr();
+tcg_gen_add_ptr(src, cpu_env, i);
+tcg_gen_ld_i64(t0, src, vofs);
+
+/* Minimize the number of local temps that must be re-read from
+ * the stack each iteration.  Instead, re-compute values other
+ * than the loop counter.
+ */
+tcg_gen_addi_ptr(src, i, imm);
+#if UINTPTR_MAX == UINT32_MAX
+tcg_gen_extu_i32_i64(addr, TCGV_PTR_TO_NAT(src));
+tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn));
+#else
+tcg_gen_add_i64(addr, TCGV_PTR_TO_NAT(src), cpu_reg_sp(s, rn));
+#endif
+tcg_temp_free_ptr(src);
+
+tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
+
+tcg_gen_addi_ptr(i, i, 8);
+
+glue(tcg_gen_brcondi_, ptr)(TCG_COND_LTU, TCGV_PTR_TO_NAT(i),
+   len_align, loop);
+tcg_temp_free_ptr(i);
+}
+
+/* Predicate register stores can be any multiple of 2.  */
+if (len_remain) {
+tcg_gen_ld_i64(t0, cpu_env, vofs + len_align);
+tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align);
+
+switch (len_remain) {
+case 2:
+case 4:
+case 8:
+tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain));
+break;
+
+case 6:
+tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL);
+tcg_gen_addi_i64(addr, addr, 4);
+tcg_gen_shri_i64(t0, t0, 32);
+tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW);
+break;
+
+default:
+g_assert_not_reached();
+}
+}
+tcg_temp_free_i64(addr);
+tcg_temp_free_i64(t0);
+}
+
 #undef ptr
 
 static void trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn)
@@ -3515,6 +3604,18 @@ static void trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
 do_ldr(s, pred_full_reg_offset(s, a->rd), size, a->rn, a->imm * size);
 }
 
+static void trans_STR_zri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+int size = vec_full_reg_size(s);
+do_str(s, vec_full_reg_offset(s, a->rd), size, a->rn, a->imm * size);
+}
+
+static void trans_STR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+int size = pred_full_reg_size(s);
+do_str(s, pred_full_reg_offset(s, a->rd), size, a->rn, a->imm * size);
+}
+
 /*
  *** SVE Memory - Contiguous Load Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 3e30985a09..5d8e1481d7 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -800,6 +800,12 @@ LD1RQ_zpri 1010010 .. 00 0 001 ... . . \
 
 ### SVE Memory Store Group
 
+# SVE store predicate register
+STR_pri 1110010 11 0. . 000 ... . 0 @pd_rn_i9
+
+# SVE store vector register
+STR_zri 1110010 11 0. . 010 ... . . @rd_rn_i9
+
 # SVE contiguous store (scalar plus immediate)
 # ST1B, ST1H, ST1W, ST1D; require msz <= esz
 ST_zpri1110010 .. esz:2  0 111 ... . . \
-- 
2.14.3
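
The tail handling in do_str() above stores a 2-, 4- or 8-byte remainder with one naturally sized little-endian access, and splits a 6-byte remainder into a 4-byte store followed by a 2-byte store of bits 32..47 of the data. A host-side model of that split (assumes a little-endian host; names are hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Store the low len_remain bytes of t0 to dst as a little-endian byte
 * stream: 2, 4 and 8 bytes are a single store, 6 bytes become a 4-byte
 * store plus a 2-byte store of bits 32..47.
 */
static void store_tail(uint8_t *dst, uint64_t t0, unsigned len_remain)
{
    switch (len_remain) {
    case 2:
    case 4:
    case 8:
        memcpy(dst, &t0, len_remain);    /* little-endian host assumed */
        break;
    case 6:
        memcpy(dst, &t0, 4);
        t0 >>= 32;
        memcpy(dst + 4, &t0, 2);
        break;
    default:
        break;                           /* unreachable for valid sizes */
    }
}

int main(void)
{
    uint8_t buf[8] = {0};

    store_tail(buf, 0x0011223344556677ull, 6);
    for (int i = 0; i < 8; i++) {
        printf("%02x ", buf[i]);         /* 77 66 55 44 33 22 00 00 */
    }
    printf("\n");
    return 0;
}
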




[Qemu-devel] [PATCH v2 37/67] target/arm: Implement SVE Integer Compare - Immediate Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 44 +++
 target/arm/sve_helper.c| 88 ++
 target/arm/translate-sve.c | 63 +
 target/arm/sve.decode  | 23 
 4 files changed, 218 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 6ffd1fbe8e..ae38c0a4be 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -605,6 +605,50 @@ DEF_HELPER_FLAGS_5(sve_cmplo_ppzw_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_cmpls_ppzw_s, TCG_CALL_NO_RWG,
i32, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmple_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_b, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmple_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_h, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmple_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_s, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_cmpeq_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpne_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpgt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpge_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplt_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmple_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphs_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmphi_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmplo_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_cmpls_ppzi_d, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index ae433861f8..b74db681f2 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2367,3 +2367,91 @@ DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
 #undef DO_CMP_PPZW_H
 #undef DO_CMP_PPZW_S
 #undef DO_CMP_PPZW
+
+/* Similar, but the second source is immediate.  */
+#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
+{\
+intptr_t opr_sz = simd_oprsz(desc);  \
+uint32_t flags = PREDTEST_INIT;  \
+TYPE mm = simd_data(desc);   \
+intptr_t i = opr_sz; \
+do {  

[Qemu-devel] [PATCH v2 31/67] target/arm: Implement SVE conditionally broadcast/extract element

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|   2 +
 target/arm/sve_helper.c|  11 ++
 target/arm/translate-sve.c | 299 +
 target/arm/sve.decode  |  20 +++
 4 files changed, 332 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index d977aea00d..a58fb4ba01 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -463,6 +463,8 @@ DEF_HELPER_FLAGS_4(sve_trn_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_compact_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_compact_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_2(sve_last_active_element, TCG_CALL_NO_RWG, s32, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 87a1a32232..ee289be642 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2050,3 +2050,14 @@ void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
 d[j] = 0;
 }
 }
+
+/* Similar to the ARM LastActiveElement pseudocode function, except the
+   result is multiplied by the element size.  This includes the not found
+   indication; e.g. not found for esz=3 is -8.  */
+int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
+{
+intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+
+return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 21531b259c..207a22a0bc 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2123,6 +2123,305 @@ static void trans_COMPACT(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 do_zpz_ool(s, a, fns[a->esz]);
 }
 
+/* Call the helper that computes the ARM LastActiveElement pseudocode
+   function, scaled by the element size.  This includes the not found
+   indication; e.g. not found for esz=3 is -8.  */
+static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg)
+{
+/* Predicate sizes may be smaller and cannot use simd_desc.  We cannot
+   round up, as we do elsewhere, because we need the exact size.  */
+TCGv_ptr t_p = tcg_temp_new_ptr();
+TCGv_i32 t_desc;
+unsigned vsz = pred_full_reg_size(s);
+unsigned desc;
+
+desc = vsz - 2;
+desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz);
+
+tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg));
+t_desc = tcg_const_i32(desc);
+
+gen_helper_sve_last_active_element(ret, t_p, t_desc);
+
+tcg_temp_free_i32(t_desc);
+tcg_temp_free_ptr(t_p);
+}
+
+/* Increment LAST to the offset of the next element in the vector,
+   wrapping around to 0.  */
+static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz)
+{
+unsigned vsz = vec_full_reg_size(s);
+
+tcg_gen_addi_i32(last, last, 1 << esz);
+if (is_power_of_2(vsz)) {
+tcg_gen_andi_i32(last, last, vsz - 1);
+} else {
+TCGv_i32 max = tcg_const_i32(vsz);
+TCGv_i32 zero = tcg_const_i32(0);
+tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last);
+tcg_temp_free_i32(max);
+tcg_temp_free_i32(zero);
+}
+}
+
+/* If LAST < 0, set LAST to the offset of the last element in the vector.  */
+static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz)
+{
+unsigned vsz = vec_full_reg_size(s);
+
+if (is_power_of_2(vsz)) {
+tcg_gen_andi_i32(last, last, vsz - 1);
+} else {
+TCGv_i32 max = tcg_const_i32(vsz - (1 << esz));
+TCGv_i32 zero = tcg_const_i32(0);
+tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last);
+tcg_temp_free_i32(max);
+tcg_temp_free_i32(zero);
+}
+}
+
+/* Load an unsigned element of ESZ from BASE+OFS.  */
+static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz)
+{
+TCGv_i64 r = tcg_temp_new_i64();
+
+switch (esz) {
+case 0:
+tcg_gen_ld8u_i64(r, base, ofs);
+break;
+case 1:
+tcg_gen_ld16u_i64(r, base, ofs);
+break;
+case 2:
+tcg_gen_ld32u_i64(r, base, ofs);
+break;
+case 3:
+tcg_gen_ld_i64(r, base, ofs);
+break;
+default:
+g_assert_not_reached();
+}
+return r;
+}
+
+/* Load an unsigned element of ESZ from RM[LAST].  */
+static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last,
+ int rm, int esz)
+{
+TCGv_ptr p = tcg_temp_new_ptr();
+TCGv_i64 r;
+
+/* Convert offset into vector into offset into ENV.
+   The final adjustment for the vector register base
+   is added via constant offset to the load.  

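find_last_active() above calls a helper modelled on the ARM LastActiveElement pseudocode, but with the result pre-scaled by the element size and with -(element size) as the not-found value. A stand-alone sketch of that scan (hypothetical; the in-tree helper operates on 64-bit predicate words rather than bytes):

#include <stdint.h>
#include <stdio.h>

/* Return the byte offset of the last active element (one predicate bit
 * per vector byte, elements of 1 << esz bytes), or -(1 << esz) when no
 * element is active -- e.g. -8 for esz == 3, as noted in the patch.
 */
static int last_active_element(const uint8_t *pg, unsigned pred_bytes, int esz)
{
    for (int i = (int)(pred_bytes * 8) - (1 << esz); i >= 0; i -= 1 << esz) {
        if (pg[i / 8] & (1 << (i % 8))) {
            return i;        /* bit index == byte offset of the element */
        }
    }
    return -(1 << esz);
}

int main(void)
{
    /* Two predicate bytes cover a 128-bit vector. */
    uint8_t pg[2] = {0x10, 0x00};     /* 32-bit element 1 active */
    uint8_t none[2] = {0x00, 0x00};

    printf("%d\n", last_active_element(pg, 2, 2));     /* 4: element 1 * 4 bytes */
    printf("%d\n", last_active_element(none, 2, 2));   /* -4: nothing active */
    return 0;
}
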
[Qemu-devel] [PATCH v2 59/67] target/arm: Implement SVE Floating Point Multiply Indexed Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper.h| 14 ++
 target/arm/translate-sve.c | 44 +++
 target/arm/vec_helper.c| 64 ++
 target/arm/sve.decode  | 19 ++
 4 files changed, 141 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index f3ce58e276..a8d824b085 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -584,6 +584,20 @@ DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 6ce1b01b9a..cf2a4d3284 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3136,6 +3136,50 @@ DO_ZZI(UMIN, umin)
 
 #undef DO_ZZI
 
+/*
+ *** SVE Floating Point Multiply-Add Indexed Group
+ */
+
+static void trans_FMLA_zzxz(DisasContext *s, arg_FMLA_zzxz *a, uint32_t insn)
+{
+static gen_helper_gvec_4_ptr * const fns[3] = {
+gen_helper_gvec_fmla_idx_h,
+gen_helper_gvec_fmla_idx_s,
+gen_helper_gvec_fmla_idx_d,
+};
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   vec_full_reg_offset(s, a->ra),
+   status, vsz, vsz, a->index * 2 + a->sub,
+   fns[a->esz - 1]);
+tcg_temp_free_ptr(status);
+}
+
+/*
+ *** SVE Floating Point Multiply Indexed Group
+ */
+
+static void trans_FMUL_zzx(DisasContext *s, arg_FMUL_zzx *a, uint32_t insn)
+{
+static gen_helper_gvec_3_ptr * const fns[3] = {
+gen_helper_gvec_fmul_idx_h,
+gen_helper_gvec_fmul_idx_s,
+gen_helper_gvec_fmul_idx_d,
+};
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   vec_full_reg_offset(s, a->rm),
+   status, vsz, vsz, a->index, fns[a->esz - 1]);
+tcg_temp_free_ptr(status);
+}
+
 /*
  *** SVE Floating Point Accumulating Reduction Group
  */
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index ad5c29cdd5..e711a3217d 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -24,6 +24,22 @@
 #include "fpu/softfloat.h"
 
 
+/* Note that vector data is stored in host-endian 64-bit chunks,
+   so addressing units smaller than that needs a host-endian fixup.  */
+#ifdef HOST_WORDS_BIGENDIAN
+#define H1(x)   ((x) ^ 7)
+#define H1_2(x) ((x) ^ 6)
+#define H1_4(x) ((x) ^ 4)
+#define H2(x)   ((x) ^ 3)
+#define H4(x)   ((x) ^ 1)
+#else
+#define H1(x)   (x)
+#define H1_2(x) (x)
+#define H1_4(x) (x)
+#define H2(x)   (x)
+#define H4(x)   (x)
+#endif
+
 /* Floating-point trigonometric starting value.
  * See the ARM ARM pseudocode function FPTrigSMul.
  */
@@ -92,3 +108,51 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
 
 #endif
 #undef DO_3OP
+
+/* For the indexed ops, SVE applies the index per 128-bit vector segment.
+ * For AdvSIMD, there is of course only one such vector segment.
+ */
+
+#define DO_MUL_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{  \
+intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
+intptr_t idx = simd_data(desc);\
+TYPE *d = vd, *n = vn, *m = vm;\
+for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {  \
+TYPE mm = m[H(i + idx)];   \
+for (j = 0; j < segment; j++) {\
+d[i + j] = TYPE##_mul(n[i + j], mm, stat); \
+}  \
+}  \

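The point of the DO_MUL_IDX comment above is that SVE re-reads the indexed element once per 128-bit segment rather than once per vector. A stand-alone sketch for 32-bit elements (hypothetical names; an integer multiply stands in for the softfloat call):

#include <stdint.h>
#include <stdio.h>

/* d[i] = n[i] * m[idx within the same 128-bit segment as i].
 * With 32-bit elements a segment holds 4 elements, so the selected
 * multiplier changes every 4 lanes.
 */
static void mul_idx_s(uint32_t *d, const uint32_t *n, const uint32_t *m,
                      unsigned nelem, unsigned idx)
{
    unsigned segment = 16 / sizeof(uint32_t);      /* 4 elements per 128 bits */

    for (unsigned i = 0; i < nelem; i += segment) {
        uint32_t mm = m[i + idx];                  /* per-segment multiplier */
        for (unsigned j = 0; j < segment; j++) {
            d[i + j] = n[i + j] * mm;
        }
    }
}

int main(void)
{
    uint32_t n[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint32_t m[8] = {10, 20, 30, 40, 50, 60, 70, 80};
    uint32_t d[8];

    mul_idx_s(d, n, m, 8, 1);      /* segment 0 uses m[1]=20, segment 1 uses m[5]=60 */
    for (int i = 0; i < 8; i++) {
        printf("%u ", d[i]);       /* 20 40 60 80 300 360 420 480 */
    }
    printf("\n");
    return 0;
}
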
[Qemu-devel] [PATCH v2 54/67] target/arm: Implement SVE prefetches

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c |  9 +
 target/arm/sve.decode  | 23 +++
 2 files changed, 32 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index ca49b94924..63c7a0e8d8 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3958,3 +3958,12 @@ static void trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn)
 do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
cpu_reg_sp(s, a->rn), fn);
 }
+
+/*
+ * Prefetches
+ */
+
+static void trans_PRF(DisasContext *s, arg_PRF *a, uint32_t insn)
+{
+/* Prefetch is a nop within QEMU.  */
+}
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index edd9340c02..f0144aa2d0 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -801,6 +801,29 @@ LD1RQ_zprr 1010010 .. 00 . 000 ... . . \
 LD1RQ_zpri 1010010 .. 00 0 001 ... . . \
@rpri_load_msz nreg=0
 
+# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
+PRF110 00 -1 - 0-- --- - 0 
+
+# SVE 32-bit gather prefetch (vector plus immediate)
+PRF110 -- 00 - 111 --- - 0 
+
+# SVE contiguous prefetch (scalar plus immediate)
+PRF110 11 1- - 0-- --- - 0 
+
+# SVE contiguous prefetch (scalar plus scalar)
+PRF110 -- 00 - 110 --- - 0 
+
+### SVE Memory 64-bit Gather Group
+
+# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets)
+PRF1100010 00 11 - 1-- --- - 0 
+
+# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets)
+PRF1100010 00 -1 - 0-- --- - 0 
+
+# SVE 64-bit gather prefetch (vector plus immediate)
+PRF1100010 -- 00 - 111 --- - 0 
+
 ### SVE Memory Store Group
 
 # SVE store predicate register
-- 
2.14.3




[Qemu-devel] [PATCH v2 57/67] target/arm: Implement SVE floating-point compare vectors

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 49 +++
 target/arm/sve_helper.c| 64 ++
 target/arm/translate-sve.c | 41 +
 target/arm/sve.decode  | 11 
 4 files changed, 165 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 3cb7ab9ef2..30373e3fc7 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -839,6 +839,55 @@ DEF_HELPER_FLAGS_5(sve_ucvt_ds, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_ucvt_dd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_6(sve_fcmge_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmge_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmge_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmgt_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmgt_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmgt_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmeq_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmeq_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmeq_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmne_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmne_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmne_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmuo_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmuo_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmuo_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_facge_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facge_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facge_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_facgt_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facgt_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facgt_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4edd3d4367..ace613684d 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3100,6 +3100,70 @@ DO_FMLA(sve_fnmls_zpzzz_d, 64, , 1, 1)
 
 #undef DO_FMLA
 
+/* Two operand floating-point comparison controlled by a predicate.
+ * Unlike the integer version, we are not allowed to optimistically
+ * compare operands, since the comparison may have side effects wrt
+ * the FPSR.
+ */
+#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)\
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,   \
+  void *status, uint32_t desc)  \
+{   \
+intptr_t opr_sz = simd_oprsz(desc); \
+intptr_t i = opr_sz, j = ((opr_sz - 1) & -64) >> 3; \
+do {\
+uint64_t out = 0;   \
+uint64_t pg = *(uint64_t *)(vg + j);\
+do {\
+i -= sizeof(TYPE), out <<= sizeof(TYPE);\
+if ((pg >> (i & 63)) & 1) { \
+TYPE nn = *(TYPE *)(vn + H(i)); \
+TYPE mm = *(TYPE *)(vm + H(i)); \
+out |= OP(TYPE, nn, mm, status);\
+}   \
+} while (i & 63);   \
+*(uint64_t *)(vd + j) = out;\
+j -= 8; 

[Qemu-devel] [PATCH v2 40/67] target/arm: Implement SVE Integer Compare - Scalars Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  2 +
 target/arm/sve_helper.c| 31 
 target/arm/translate-sve.c | 92 ++
 target/arm/sve.decode  |  8 
 4 files changed, 133 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index dd4f8f754d..1863106d0f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -678,3 +678,5 @@ DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_while, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index dd884bdd1c..80b78da834 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2716,3 +2716,34 @@ uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
 }
 return sum;
 }
+
+uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+uint64_t esz_mask = pred_esz_masks[esz];
+ARMPredicateReg *d = vd;
+uint32_t flags;
+intptr_t i;
+
+/* Begin with a zero predicate register.  */
+flags = do_zero(d, oprsz);
+if (count == 0) {
+return flags;
+}
+
+/* Scale from predicate element count to bits.  */
+count <<= esz;
+/* Bound to the bits in the predicate.  */
+count = MIN(count, oprsz * 8);
+
+/* Set all of the requested bits.  */
+for (i = 0; i < count / 64; ++i) {
+d->p[i] = esz_mask;
+}
+if (count & 63) {
+d->p[i] = ~(-1ull << (count & 63)) & esz_mask;
+}
+
+return predtest_ones(d, oprsz, esz_mask);
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 038800cc86..4b92a55c21 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2847,6 +2847,98 @@ static void trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a,
 do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d);
 }
 
+/*
+ *** SVE Integer Compare Scalars Group
+ */
+
+static void trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn)
+{
+TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ);
+TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf);
+TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf);
+TCGv_i64 cmp = tcg_temp_new_i64();
+
+tcg_gen_setcond_i64(cond, cmp, rn, rm);
+tcg_gen_extrl_i64_i32(cpu_NF, cmp);
+tcg_temp_free_i64(cmp);
+
+/* VF = !NF & !CF.  */
+tcg_gen_xori_i32(cpu_VF, cpu_NF, 1);
+tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF);
+
+/* Both NF and VF actually look at bit 31.  */
+tcg_gen_neg_i32(cpu_NF, cpu_NF);
+tcg_gen_neg_i32(cpu_VF, cpu_VF);
+}
+
+static void trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn)
+{
+TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1);
+TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i32 t2, t3;
+TCGv_ptr ptr;
+unsigned desc, vsz = vec_full_reg_size(s);
+TCGCond cond;
+
+if (!a->sf) {
+if (a->u) {
+tcg_gen_ext32u_i64(op0, op0);
+tcg_gen_ext32u_i64(op1, op1);
+} else {
+tcg_gen_ext32s_i64(op0, op0);
+tcg_gen_ext32s_i64(op1, op1);
+}
+}
+
+/* For the helper, compress the different conditions into a computation
+ * of how many iterations for which the condition is true.
+ *
+ * This is slightly complicated by 0 <= UINT64_MAX, which is nominally
+ * 2**64 iterations, overflowing to 0.  Of course, predicate registers
+ * aren't that large, so any value >= predicate size is sufficient.
+ */
+tcg_gen_sub_i64(t0, op1, op0);
+
+/* t0 = MIN(op1 - op0, vsz).  */
+if (a->eq) {
+/* Equality means one more iteration.  */
+tcg_gen_movi_i64(t1, vsz - 1);
+tcg_gen_movcond_i64(TCG_COND_LTU, t0, t0, t1, t0, t1);
+tcg_gen_addi_i64(t0, t0, 1);
+} else {
+tcg_gen_movi_i64(t1, vsz);
+tcg_gen_movcond_i64(TCG_COND_LTU, t0, t0, t1, t0, t1);
+}
+
+/* t0 = (condition true ? t0 : 0).  */
+cond = (a->u
+? (a->eq ? TCG_COND_LEU : TCG_COND_LTU)
+: (a->eq ? TCG_COND_LE : TCG_COND_LT));
+tcg_gen_movi_i64(t1, 0);
+tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1);
+
+t2 = tcg_temp_new_i32();
+tcg_gen_extrl_i64_i32(t2, t0);
+tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
+
+desc = (vsz / 8) - 2;
+desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz);
+t3 = tcg_const_i32(desc);
+
+ptr = tcg_temp_new_ptr();
+tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd));
+
+gen_helper_sve_while(t2, ptr, t2, t3);
+do_pred_flags(t2);
+
+tc

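The trans_WHILE comment above is the core idea of the patch: every WHILE variant is reduced to a count of leading elements for which the condition holds, and that count is handed to a single helper that sets the predicate bits. A worked host-side model of the count for WHILELT (signed less-than; the names and the element-based clamp are simplifications of the byte-based code above):

#include <stdint.h>
#include <stdio.h>

/* Number of leading active elements a WHILELT produces: the count of
 * iterations for which op0 + i < op1, clamped to the number of elements
 * in the vector, and forced to 0 when the condition already fails for
 * the first element.  This mirrors the t0 computation in trans_WHILE().
 */
static uint64_t whilelt_count(int64_t op0, int64_t op1, uint64_t vsz_elem)
{
    uint64_t diff = (uint64_t)op1 - (uint64_t)op0;   /* nominal iteration count */

    if (diff > vsz_elem) {
        diff = vsz_elem;                             /* clamp to vector length */
    }
    return op0 < op1 ? diff : 0;                     /* condition false: empty */
}

int main(void)
{
    /* 256-bit vector of 32-bit elements -> 8 elements. */
    printf("%llu\n", (unsigned long long)whilelt_count(0, 5, 8));    /* 5 */
    printf("%llu\n", (unsigned long long)whilelt_count(0, 100, 8));  /* 8 */
    printf("%llu\n", (unsigned long long)whilelt_count(7, 3, 8));    /* 0 */
    return 0;
}
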
[Qemu-devel] [PATCH v2 64/67] target/arm: Implement SVE floating-point convert precision

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 13 +
 target/arm/sve_helper.c| 27 +++
 target/arm/translate-sve.c | 30 ++
 target/arm/sve.decode  |  8 
 4 files changed, 78 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ce5fe24dc2..bac4bfdc60 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -942,6 +942,19 @@ DEF_HELPER_FLAGS_6(sve_fmins_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_fmins_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, i64, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_fcvt_sh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_dh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_hs, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_ds, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_hd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_sd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 53e3516f47..9db01ac2f2 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3157,6 +3157,33 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
 }   \
 }
 
+static inline float32 float16_to_float32_ieee(float16 f, float_status *s)
+{
+return float16_to_float32(f, true, s);
+}
+
+static inline float64 float16_to_float64_ieee(float16 f, float_status *s)
+{
+return float16_to_float64(f, true, s);
+}
+
+static inline float16 float32_to_float16_ieee(float32 f, float_status *s)
+{
+return float32_to_float16(f, true, s);
+}
+
+static inline float16 float64_to_float16_ieee(float64 f, float_status *s)
+{
+return float64_to_float16(f, true, s);
+}
+
+DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, float32_to_float16_ieee)
+DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, float16_to_float32_ieee)
+DO_ZPZ_FP_D(sve_fcvt_dh, uint64_t, float64_to_float16_ieee)
+DO_ZPZ_FP_D(sve_fcvt_hd, uint64_t, float16_to_float64_ieee)
+DO_ZPZ_FP_D(sve_fcvt_ds, uint64_t, float64_to_float32)
+DO_ZPZ_FP_D(sve_fcvt_sd, uint64_t, float32_to_float64)
+
 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index e185af29e3..361d545965 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3651,6 +3651,36 @@ static void do_zpz_ptr(DisasContext *s, int rd, int rn, int pg,
 tcg_temp_free_ptr(status);
 }
 
+static void trans_FCVT_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_sh);
+}
+
+static void trans_FCVT_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hs);
+}
+
+static void trans_FCVT_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_dh);
+}
+
+static void trans_FCVT_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hd);
+}
+
+static void trans_FCVT_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_ds);
+}
+
+static void trans_FCVT_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_sd);
+}
+
 static void trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 {
 do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh);
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index ca54895900..d44cf17fc8 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -824,6 +824,14 @@ FNMLS_zpzzz 01100101 .. 1 . 111 ... . . @rdn_pg_rm_ra
 
 ### SVE FP Unary Operations Predicated Group
 
+# SVE floating-point convert precision
+FCVT_sh 01100101 10 0010 00 101 ... . . @rd_pg_rn_e0
+FCVT_hs 01100101 10 0010 01 101 ... . . @rd_pg_rn_e0
+FCVT_dh 01100101 11 0010 00 101 ... . . @rd_pg_rn_e0
+FCVT_hd 01100101 11 0010 01 101 ... . . @rd_pg_rn_e0
+FCVT_ds 01100101 11 0010 10 101 ... . . @rd_pg_rn_e0
+FCVT_sd 01100101 11 0010 11 101 ... . . @rd_pg_rn_e0

[Qemu-devel] [PATCH v2 38/67] target/arm: Implement SVE Partition Break Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  18 
 target/arm/sve_helper.c| 247 +
 target/arm/translate-sve.c |  96 ++
 target/arm/sve.decode  |  19 
 4 files changed, 380 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ae38c0a4be..f0a3ed3414 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -658,3 +658,21 @@ DEF_HELPER_FLAGS_5(sve_orn_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_nor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_nand_pppp, TCG_CALL_NO_RWG,
 void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_brkpa, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_brkpb, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_brkpas, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_brkpbs, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_brka_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkb_z, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brka_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkb_m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_brkas_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkbs_z, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkas_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkbs_m, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b74db681f2..d6d2220f8b 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2455,3 +2455,250 @@ DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
 #undef DO_CMP_PPZI_S
 #undef DO_CMP_PPZI_D
 #undef DO_CMP_PPZI
+
+/* Similar to the ARM LastActive pseudocode function.  */
+static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
+{
+intptr_t i;
+
+for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
+uint64_t pg = *(uint64_t *)(vg + i);
+if (pg) {
+return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
+}
+}
+return 0;
+}
+
+/* Compute a mask into RETB that is true for all G, up to and including
+ * (if after) or excluding (if !after) the first G & N.
+ * Return true if BRK found.
+ */
+static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
+bool brk, bool after)
+{
+uint64_t b;
+
+if (brk) {
+b = 0;
+} else if ((g & n) == 0) {
+/* For all G, no N are set; break not found.  */
+b = g;
+} else {
+/* Break somewhere in N.  Locate it.  */
+b = g & n;/* guard true, pred true*/
+b = b & -b;   /* first such */
+if (after) {
+b = b | (b - 1);  /* break after same */
+} else {
+b = b - 1;/* break before same */
+}
+brk = true;
+}
+
+*retb = b;
+return brk;
+}
+
+/* Compute a zeroing BRK.  */
+static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
+  intptr_t oprsz, bool after)
+{
+bool brk = false;
+intptr_t i;
+
+for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+uint64_t this_b, this_g = g[i];
+
+brk = compute_brk(&this_b, n[i], this_g, brk, after);
+d[i] = this_b & this_g;
+}
+}
+
+/* Likewise, but also compute flags.  */
+static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
+   intptr_t oprsz, bool after)
+{
+uint32_t flags = PREDTEST_INIT;
+bool brk = false;
+intptr_t i;
+
+for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+uint64_t this_b, this_d, this_g = g[i];
+
+brk = compute_brk(&this_b, n[i], this_g, brk, after);
+d[i] = this_d = this_b & this_g;
+flags = iter_predtest_fwd(this_d, this_g, flags);
+}
+return flags;
+}
+
+/* Given a computation function, compute a merging BRK.  */
+static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
+  intptr_t oprsz, bool after)
+{
+bool brk = false;
+intptr_t i;
+
+for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+uint64_t this_b, this_g = g[i];
+
+brk = compute_brk(&this_b, n[i], this_g, brk, after);
+d[i] = (this_b & this_g) | (d[i] & ~this_g);
+}
+}
+
+/* Likewise, but also compute flags.  */
+static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
+   intptr_t oprsz, bool after)
+{
+uint32_t flags = PREDTEST_INIT;
+bool brk = false;
+intptr_t i;
+
+for (i = 0; i < oprsz / 

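compute_brk() above locates the break point with two classic bit tricks: b & -b isolates the lowest guarded true bit, and b | (b - 1) versus b - 1 build the inclusive ("break after") or exclusive ("break before") mask. A small worked example (stand-alone, hypothetical values):

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
    uint64_t g = 0xff;          /* guard: low 8 elements governed */
    uint64_t n = 0x30;          /* first guarded "true" bits at positions 4 and 5 */

    uint64_t b = g & n;         /* 0x30: candidates under the guard */
    b = b & -b;                 /* 0x10: isolate the lowest set bit */

    uint64_t after  = b | (b - 1);   /* 0x1f: keep elements up to and including it */
    uint64_t before = b - 1;         /* 0x0f: keep elements strictly before it */

    printf("after=%#" PRIx64 " before=%#" PRIx64 "\n", after, before);
    return 0;
}
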
[Qemu-devel] [PATCH v2 67/67] target/arm: Implement SVE floating-point unary operations

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 ++
 target/arm/sve_helper.c|  8 
 target/arm/translate-sve.c | 28 
 target/arm/sve.decode  |  4 
 4 files changed, 54 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 749bab0b38..5cebc9121d 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -999,6 +999,20 @@ DEF_HELPER_FLAGS_5(sve_frintx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_frintx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_frecpx_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frecpx_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frecpx_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fsqrt_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fsqrt_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fsqrt_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 7950710be7..4f0985a29e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3208,6 +3208,14 @@ DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
 DO_ZPZ_FP_D(sve_frintx_d, uint64_t, float64_round_to_int)
 
+DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
+DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
+DO_ZPZ_FP_D(sve_frecpx_d, uint64_t, helper_frecpx_f64)
+
+DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
+DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
+DO_ZPZ_FP_D(sve_fsqrt_d, uint64_t, float64_sqrt)
+
 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 5f1c4984b8..f1ff03 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3831,6 +3831,34 @@ static void trans_FRINTA(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 do_frint_mode(s, a, float_round_ties_away);
 }
 
+static void trans_FRECPX(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3_ptr * const fns[3] = {
+gen_helper_sve_frecpx_h,
+gen_helper_sve_frecpx_s,
+gen_helper_sve_frecpx_d
+};
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+}
+
+static void trans_FSQRT(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3_ptr * const fns[3] = {
+gen_helper_sve_fsqrt_h,
+gen_helper_sve_fsqrt_s,
+gen_helper_sve_fsqrt_d
+};
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+}
+
 static void trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 {
 do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh);
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index e06c0c5279..fbd9cf1384 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -857,6 +857,10 @@ FRINTA 01100101 .. 000 100 101 ... . . @rd_pg_rn
 FRINTX 01100101 .. 000 110 101 ... . . @rd_pg_rn
 FRINTI 01100101 .. 000 111 101 ... . . @rd_pg_rn
 
+# SVE floating-point unary operations
+FRECPX 01100101 .. 001 100 101 ... . . @rd_pg_rn
+FSQRT  01100101 .. 001 101 101 ... . . @rd_pg_rn
+
 # SVE integer convert to floating-point
 SCVTF_hh   01100101 01 010 01 0 101 ... . .@rd_pg_rn_e0
 SCVTF_sh   01100101 01 010 10 0 101 ... . .@rd_pg_rn_e0
-- 
2.14.3




[Qemu-devel] [PATCH v2 61/67] target/arm: Implement SVE Floating Point Unary Operations - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  8 
 target/arm/translate-sve.c | 43 +++
 target/arm/vec_helper.c| 20 
 target/arm/sve.decode  |  5 +
 4 files changed, 76 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index a8d824b085..4bfefe42b2 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -565,6 +565,14 @@ DEF_HELPER_2(dc_zva, void, env, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
+DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index a77ddf0f4b..463ff7b690 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3235,6 +3235,49 @@ DO_VPZ(FMAXNMV, fmaxnmv)
 DO_VPZ(FMINV, fminv)
 DO_VPZ(FMAXV, fmaxv)
 
+/*
+ *** SVE Floating Point Unary Operations - Unpredicated Group
+ */
+
+static void do_zz_fp(DisasContext *s, arg_rr_esz *a, gen_helper_gvec_2_ptr *fn)
+{
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   status, vsz, vsz, 0, fn);
+tcg_temp_free_ptr(status);
+}
+
+static void trans_FRECPE(DisasContext *s, arg_rr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_2_ptr * const fns[3] = {
+gen_helper_gvec_frecpe_h,
+gen_helper_gvec_frecpe_s,
+gen_helper_gvec_frecpe_d,
+};
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zz_fp(s, a, fns[a->esz - 1]);
+}
+}
+
+static void trans_FRSQRTE(DisasContext *s, arg_rr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_2_ptr * const fns[3] = {
+gen_helper_gvec_frsqrte_h,
+gen_helper_gvec_frsqrte_s,
+gen_helper_gvec_frsqrte_d,
+};
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zz_fp(s, a, fns[a->esz - 1]);
+}
+}
+
 /*
  *** SVE Floating Point Accumulating Reduction Group
  */
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index e711a3217d..60dc07cf87 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -40,6 +40,26 @@
 #define H4(x)   (x)
 #endif
 
+#define DO_2OP(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
+{ \
+intptr_t i, oprsz = simd_oprsz(desc); \
+TYPE *d = vd, *n = vn;\
+for (i = 0; i < oprsz / sizeof(TYPE); i++) {  \
+d[i] = FUNC(n[i], stat);  \
+} \
+}
+
+DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
+DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
+DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
+
+DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
+DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
+DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
+
+#undef DO_2OP
+
 /* Floating-point trigonometric starting value.
  * See the ARM ARM pseudocode function FPTrigSMul.
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index feb8c65e89..112e85174c 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -747,6 +747,11 @@ FMINNMV 01100101 .. 000 101 001 ... . . @rd_pg_rn
 FMAXV  01100101 .. 000 110 001 ... . . @rd_pg_rn
 FMINV  01100101 .. 000 111 001 ... . . @rd_pg_rn
 
+## SVE Floating Point Unary Operations - Unpredicated Group
+
+FRECPE 01100101 .. 001 110 001110 . .  @rd_rn
+FRSQRTE01100101 .. 001 111 001110 . .  @rd_rn
+
 ### SVE FP Accumulating Reduction Group
 
 # SVE floating-point serial reduction (predicated)
-- 
2.14.3




[Qemu-devel] [PATCH v2 45/67] target/arm: Implement SVE Memory Contiguous Store Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  29 +++
 target/arm/sve_helper.c| 211 +
 target/arm/translate-sve.c |  68 ++-
 target/arm/sve.decode  |  38 
 4 files changed, 343 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index fcc9ba5f50..74c2d642a3 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -754,3 +754,32 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 
 DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1hs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1hd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1sd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e542725113..e259e910de 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3023,3 +3023,214 @@ void HELPER(sve_ld4dd_r)(CPUARMState *env, void *vg,
 addr += 4 * 8;
 }
 }
+
+/*
+ * Store contiguous data, protected by a governing predicate.
+ */
+#define DO_ST1(NAME, FN, TYPEE, TYPEM, H)  \
+void HELPER(NAME)(CPUARMState *env, void *vg,  \
+  target_ulong addr, uint32_t desc)\
+{  \
+intptr_t i, oprsz = simd_oprsz(desc);  \
+intptr_t ra = GETPC(); \
+unsigned rd = simd_data(desc); \
+void *vd = &env->vfp.zregs[rd];\
+for (i = 0; i < oprsz; ) { \
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));\
+do {   \
+if (pg & 1) {  \
+TYPEM m = *(TYPEE *)(vd + H(i));   \
+FN(env, addr, m, ra);  \
+}  \
+i += sizeof(TYPEE), pg >>= sizeof(TYPEE);  \
+addr += sizeof(TYPEM); \
+} while (i & 15);  \
+}  \
+}
+
+#define DO_ST1_D(NAME, FN, TYPEM)  \
+void HELPER(NAME)(CPUARMState *env, void *vg,  \
+  target_ulong addr, uint32_t desc)\
+{  \
+intptr_t i, oprsz = simd_oprsz(desc) / 8;  \
+intptr_t ra = GETPC(); \
+unsigned rd = simd_data(desc); \
+uint64_t *d = &env->vfp.zregs[rd].d[0];\
+uint8_t *pg = vg;  \
+for (i = 0; i < oprsz; i += 1) {   \
+if (pg[H1(i)] & 1) {   \
+FN(env, addr, d[i], ra);   \
+}  \
+addr += sizeof(TYPEM); \
+}  \
+}
+
+#define DO_ST2(NAME, FN, TYPEE, TYPEM, H)  \
+void HELPER(NAME)(CPUARMState *env, void *vg,  \
+  target_ulong addr, uint32_t desc)  

[Qemu-devel] [PATCH v2 42/67] target/arm: Implement SVE Integer Wide Immediate - Unpredicated Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  25 +
 target/arm/sve_helper.c|  41 ++
 target/arm/translate-sve.c | 135 +
 target/arm/sve.decode  |  26 +
 4 files changed, 227 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 1863106d0f..97bfe0f47b 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -680,3 +680,28 @@ DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_3(sve_while, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+
+DEF_HELPER_FLAGS_4(sve_subri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_subri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_subri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_subri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(sve_smaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(sve_smini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_smini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(sve_umaxi_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umaxi_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umaxi_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umaxi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(sve_umini_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umini_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umini_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(sve_umini_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 80b78da834..4f45f11bff 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -803,6 +803,46 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
 #undef DO_VPZ
 #undef DO_VPZ_D
 
+/* Two vector operand, one scalar operand, unpredicated.  */
+#define DO_ZZI(NAME, TYPE, OP)   \
+void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
+{\
+intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);\
+TYPE s = s64, *d = vd, *n = vn;  \
+for (i = 0; i < opr_sz; ++i) {   \
+d[i] = OP(n[i], s);  \
+}\
+}
+
+#define DO_SUBR(X, Y)   (Y - X)
+
+DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
+DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
+DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
+DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
+
+DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
+DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
+DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
+DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
+
+DO_ZZI(sve_smini_b, int8_t, DO_MIN)
+DO_ZZI(sve_smini_h, int16_t, DO_MIN)
+DO_ZZI(sve_smini_s, int32_t, DO_MIN)
+DO_ZZI(sve_smini_d, int64_t, DO_MIN)
+
+DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
+DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
+DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
+DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
+
+DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
+DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
+DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
+DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
+
+#undef DO_ZZI
+
 #undef DO_AND
 #undef DO_ORR
 #undef DO_EOR
@@ -817,6 +857,7 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
 #undef DO_ASR
 #undef DO_LSR
 #undef DO_LSL
+#undef DO_SUBR
 
 /* Similar to the ARM LastActiveElement pseudocode function, except the
result is multiplied by the element size.  This includes the not found
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 7571d02237..72abcb543a 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -81,6 +81,11 @@ static inline int expand_imm_sh8s(int x)
 return (int8_t)x << (x & 0x100 ? 8 : 0);
 }
 
+static inline int expand_imm_sh8u(int x)
+{
+return (uint8_t)x << (x & 0x100 ? 8 : 0);
+}
+
 /*
  * Include the generated decoder.
  */
@@ -2974,6 +2979,136 @@ static void trans_DUP_i(DisasContext *s, arg_DUP_i *a, uint32_t insn)
 tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm));
 }
 
+static void trans_ADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+

[Qemu-devel] [PATCH v2 62/67] target/arm: Implement SVE FP Compare with Zero Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 42 ++
 target/arm/sve_helper.c| 45 +
 target/arm/translate-sve.c | 41 +
 target/arm/sve.decode  | 10 ++
 4 files changed, 138 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index c07b2245ba..696c97648b 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -767,6 +767,48 @@ DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_fadda_d, TCG_CALL_NO_RWG,
i64, i64, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_fcmge0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmge0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmge0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmgt0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmgt0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmgt0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmlt0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmlt0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmlt0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmle0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmle0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmle0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmeq0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmeq0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmeq0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmne0_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmne0_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmne0_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 29deefcd86..6a052ce9ad 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3270,6 +3270,8 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,  \
 
 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
+#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
+#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
 #define DO_FCMUO(TYPE, X, Y, ST)  \
@@ -3293,6 +3295,49 @@ DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
 #undef DO_FPCMP_PPZZ_H
 #undef DO_FPCMP_PPZZ
 
+/* One operand floating-point comparison against zero, controlled
+ * by a predicate.
+ */
+#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)   \
+void HELPER(NAME)(void *vd, void *vn, void *vg,\
+  void *status, uint32_t desc) \
+{  \
+intptr_t opr_sz = simd_oprsz(desc);\
+intptr_t i = opr_sz, j = ((opr_sz - 1) & -64) >> 3;\
+do {   \
+uint64_t out = 0;  \
+uint64_t pg = *(uint64_t *)(vg + j);   \
+do {   \
+i -= sizeof(TYPE), out <<= sizeof(TYPE);   \
+if ((pg >> (i & 63)) & 1) {\
+TYPE nn = *(TYPE *)(vn + H(i));\
+out |= OP(TYPE, nn, 0, status);\
+}  \
+} while (i & 63);  \
+*(uint64_t *)(vd + j) = out;   \
+j -= 8;\
+} while (i > 0);   \
+}
+
+#define DO_FPCMP_PPZ0_H(NAME, OP) \
+DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
+#define DO_FPCMP_PPZ0_S(NAME, OP) \
+DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
+#define DO_FPCMP_PPZ0_D(NAME, OP) \
+DO_FPCMP_PPZ0(NAME##_d, f

Re: [Qemu-devel] [PATCH 3/3] arm/vexpress: Add proper display connector emulation

2018-02-17 Thread Philippe Mathieu-Daudé
Hi Linus,

On 02/17/2018 11:00 AM, Linus Walleij wrote:
> This adds the SiI9022 and EDID I2C devices to the ARM Versatile
> Express machine, and selects the two I2C devices necessary in the
> arm-softmmy.mak configuration so everything will build smoothly.
> 
> I am implementing proper handling of the graphics in the Linux
> kernel and adding proper emulation of SiI9022 and EDID makes the
> driver probe as nicely as before, retrieving the resolutions
> supported by the "QEMU monitor" and overall just working nicely.
> 
> The assignment of the SiI9022 at address 0x39 and the EDID
> DDC I2C at address 0x50 is not strictly correct: the DDC I2C
> is there all the time but in the actual component it only
> appears once activated inside the SiI9022, so ideally it should
> be added and removed to the bus by the SiI9022. However for this
> purpose it works fine to just have it around.

This seems easier to just do now rather than postpone it :)

In your patch #2:

static void sii9022_realize(DeviceState *dev, Error **errp)
{
I2CBus *bus;

bus = I2C_BUS(qdev_get_parent_bus(dev));
i2c_create_slave(bus, TYPE_I2CDDC, 0x50);
}

static void sii9022_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);

...
dc->realize = sii9022_realize;
}
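
With a hook like that, the board code only needs to instantiate the
sii9022 slave and the DDC child comes along automatically; roughly
(untested sketch, relying on the realize hook above):

i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c");
i2c_create_slave(i2c, "sii9022", 0x39);
/* no explicit i2c-ddc slave here: sii9022_realize() adds it at 0x50 */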

> 
> Signed-off-by: Linus Walleij 
> ---
>  default-configs/arm-softmmu.mak | 2 ++
>  hw/arm/vexpress.c   | 7 ++-
>  2 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
> index ca34cf446242..54f855d07206 100644
> --- a/default-configs/arm-softmmu.mak
> +++ b/default-configs/arm-softmmu.mak
> @@ -21,6 +21,8 @@ CONFIG_STELLARIS_INPUT=y
>  CONFIG_STELLARIS_ENET=y
>  CONFIG_SSD0303=y
>  CONFIG_SSD0323=y
> +CONFIG_DDC=y
> +CONFIG_SII9022=y
>  CONFIG_ADS7846=y
>  CONFIG_MAX111X=y
>  CONFIG_SSI=y
> diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c
> index dc5928ae1ab5..d6c912c97684 100644
> --- a/hw/arm/vexpress.c
> +++ b/hw/arm/vexpress.c
> @@ -29,6 +29,7 @@
>  #include "hw/arm/arm.h"
>  #include "hw/arm/primecell.h"
>  #include "hw/devices.h"
> +#include "hw/i2c/i2c.h"
>  #include "net/net.h"
>  #include "sysemu/sysemu.h"
>  #include "hw/boards.h"
> @@ -537,6 +538,7 @@ static void vexpress_common_init(MachineState *machine)
>  uint32_t sys_id;
>  DriveInfo *dinfo;
>  pflash_t *pflash0;
> +I2CBus *i2c;
>  ram_addr_t vram_size, sram_size;
>  MemoryRegion *sysmem = get_system_memory();
>  MemoryRegion *vram = g_new(MemoryRegion, 1);
> @@ -628,7 +630,10 @@ static void vexpress_common_init(MachineState *machine)
>  sysbus_create_simple("sp804", map[VE_TIMER01], pic[2]);
>  sysbus_create_simple("sp804", map[VE_TIMER23], pic[3]);
>  
> -/* VE_SERIALDVI: not modelled */
> +dev = sysbus_create_simple("versatile_i2c", map[VE_SERIALDVI], NULL);
> +i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c");
> +i2c_create_slave(i2c, "sii9022", 0x39);
> +i2c_create_slave(i2c, "i2c-ddc", 0x50);
>  
>  sysbus_create_simple("pl031", map[VE_RTC], pic[4]); /* RTC */
>  
> 



[Qemu-devel] [PATCH v2 51/67] target/arm: Implement SVE load and broadcast element

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  5 +
 target/arm/sve_helper.c| 43 
 target/arm/translate-sve.c | 55 +-
 target/arm/sve.decode  |  5 +
 4 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index c4502256d5..6c640a92ff 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -274,6 +274,11 @@ DEF_HELPER_FLAGS_3(sve_clr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_clr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve_clr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_clri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_clri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_asr_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_asr_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_asr_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 0e2b3091b0..a7dc6f6164 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -994,6 +994,49 @@ void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
 }
 }
 
+/* Store zero into every inactive element of Zd.  */
+void HELPER(sve_clri_b)(void *vd, void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] &= expand_pred_b(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_clri_h)(void *vd, void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] &= expand_pred_h(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_clri_s)(void *vd, void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+for (i = 0; i < opr_sz; i += 1) {
+d[i] &= expand_pred_s(pg[H1(i)]);
+}
+}
+
+void HELPER(sve_clri_d)(void *vd, void *vg, uint32_t desc)
+{
+intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *d = vd;
+uint8_t *pg = vg;
+for (i = 0; i < opr_sz; i += 1) {
+if (!(pg[H1(i)] & 1)) {
+d[i] = 0;
+}
+}
+}
+
 /* Three-operand expander, immediate operand, controlled by a predicate.
  */
 #define DO_ZPZI(NAME, TYPE, H, OP)  \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 32f0340738..b000a2482e 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -584,6 +584,19 @@ static void do_clr_zp(DisasContext *s, int rd, int pg, int esz)
vsz, vsz, 0, fns[esz]);
 }
 
+/* Store zero into every inactive element of Zd.  */
+static void do_clr_inactive_zp(DisasContext *s, int rd, int pg, int esz)
+{
+static gen_helper_gvec_2 * const fns[4] = {
+gen_helper_sve_clri_b, gen_helper_sve_clri_h,
+gen_helper_sve_clri_s, gen_helper_sve_clri_d,
+};
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd),
+   pred_full_reg_offset(s, pg),
+   vsz, vsz, 0, fns[esz]);
+}
+
 static void do_zpzi_ool(DisasContext *s, arg_rpri_esz *a,
 gen_helper_gvec_3 *fn)
 {
@@ -3506,7 +3519,7 @@ static void trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
  *** SVE Memory - Contiguous Load Group
  */
 
-/* The memory element size of dtype.  */
+/* The memory mode of the dtype.  */
 static const TCGMemOp dtype_mop[16] = {
 MO_UB, MO_UB, MO_UB, MO_UB,
 MO_SL, MO_UW, MO_UW, MO_UW,
@@ -3671,6 +3684,46 @@ static void trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
 do_ldrq(s, a->rd, a->pg, addr, dtype_msz(a->dtype));
 }
 
+/* Load and broadcast element.  */
+static void trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+unsigned vsz = vec_full_reg_size(s);
+unsigned psz = pred_full_reg_size(s);
+unsigned esz = dtype_esz[a->dtype];
+TCGLabel *over = gen_new_label();
+TCGv_i64 temp;
+
+/* If the guarding predicate has no bits set, no load occurs.  */
+if (psz <= 8) {
+temp = tcg_temp_new_i64();
+tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg));
+tcg_gen_andi_i64(temp, temp,
+ deposit64(0, 0, psz * 8, pred_esz_masks[esz]));
+tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over);
+tcg_temp_free_i64(temp);
+} else {
+TCGv_i32 t32 = tcg_temp_new_i32();
+find_last_active(s, t32, esz, a->pg);
+tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over);
+tcg

[Qemu-devel] [PATCH v2 46/67] target/arm: Implement SVE load and broadcast quadword

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 51 ++
 target/arm/sve.decode  |  9 
 2 files changed, 60 insertions(+)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index fda9a56fd5..7b21102b7e 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3398,6 +3398,57 @@ static void trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
 trans_LD_zpri(s, a, insn);
 }
 
+static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz)
+{
+static gen_helper_gvec_mem * const fns[4] = {
+gen_helper_sve_ld1bb_r, gen_helper_sve_ld1hh_r,
+gen_helper_sve_ld1ss_r, gen_helper_sve_ld1dd_r,
+};
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr t_pg;
+TCGv_i32 desc;
+
+/* Load the first quadword using the normal predicated load helpers.  */
+desc = tcg_const_i32(simd_desc(16, 16, zt));
+t_pg = tcg_temp_new_ptr();
+
+tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+fns[msz](cpu_env, t_pg, addr, desc);
+
+tcg_temp_free_ptr(t_pg);
+tcg_temp_free_i32(desc);
+
+/* Replicate that first quadword.  */
+if (vsz > 16) {
+unsigned dofs = vec_full_reg_offset(s, zt);
+tcg_gen_gvec_dup_mem(4, dofs + 16, dofs, vsz - 16, vsz - 16);
+}
+}
+
+static void trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn)
+{
+TCGv_i64 addr;
+int msz = dtype_msz(a->dtype);
+
+if (a->rm == 31) {
+unallocated_encoding(s);
+return;
+}
+
+addr = new_tmp_a64(s);
+tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz);
+tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+do_ldrq(s, a->rd, a->pg, addr, msz);
+}
+
+static void trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+TCGv_i64 addr = new_tmp_a64(s);
+
+tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16);
+do_ldrq(s, a->rd, a->pg, addr, dtype_msz(a->dtype));
+}
+
 static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
   int msz, int esz, int nreg)
 {
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 41b8cd8746..6c906e25e9 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -723,6 +723,15 @@ LD_zprr 1010010 .. nreg:2 . 110 ... . . @rprr_load_msz
 # LD2B, LD2H, LD2W, LD2D; etc.
 LD_zpri 1010010 .. nreg:2 0 111 ... . . @rpri_load_msz
 
+# SVE load and broadcast quadword (scalar plus scalar)
+LD1RQ_zprr 1010010 .. 00 . 000 ... . . \
+   @rprr_load_msz nreg=0
+
+# SVE load and broadcast quadword (scalar plus immediate)
+# LD1RQB, LD1RQH, LD1RQS, LD1RQD
+LD1RQ_zpri 1010010 .. 00 0 001 ... . . \
+   @rpri_load_msz nreg=0
+
 ### SVE Memory Store Group
 
 # SVE contiguous store (scalar plus immediate)
-- 
2.14.3




[Qemu-devel] [PATCH v2 49/67] target/arm: Implement SVE FP Multiply-Add Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 16 ++
 target/arm/sve_helper.c| 53 ++
 target/arm/translate-sve.c | 41 +++
 target/arm/sve.decode  | 17 +++
 4 files changed, 127 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 84d0a8978c..a95f077c7f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -827,6 +827,22 @@ DEF_HELPER_FLAGS_5(sve_ucvt_ds, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_ucvt_dd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index d80babfae7..6622275b44 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2948,6 +2948,59 @@ DO_ZPZ_FP_D(sve_ucvt_dd, uint64_t, uint64_to_float64)
 #undef DO_ZPZ_FP
 #undef DO_ZPZ_FP_D
 
+/* 4-operand predicated multiply-add.  This requires 7 operands to pass
+ * "properly", so we need to encode some of the registers into DESC.
+ */
+QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
+
+#define DO_FMLA(NAME, N, H, NEG1, NEG3) \
+void HELPER(NAME)(CPUARMState *env, void *vg, uint32_t desc)\
+{   \
+intptr_t i = 0, opr_sz = simd_oprsz(desc);  \
+unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);  \
+unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);  \
+unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); \
+unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); \
+void *vd = &env->vfp.zregs[rd]; \
+void *vn = &env->vfp.zregs[rn]; \
+void *vm = &env->vfp.zregs[rm]; \
+void *va = &env->vfp.zregs[ra]; \
+do {\
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+do {\
+if (likely(pg & 1)) {   \
+float##N e1 = *(uint##N##_t *)(vn + H(i));  \
+float##N e2 = *(uint##N##_t *)(vm + H(i));  \
+float##N e3 = *(uint##N##_t *)(va + H(i));  \
+float##N r; \
+if (NEG1) e1 = float##N##_chs(e1);  \
+if (NEG3) e3 = float##N##_chs(e3);  \
+r = float##N##_muladd(e1, e2, e3, 0, &env->vfp.fp_status);  \
+*(uint##N##_t *)(vd + H(i)) = r;\
+}   \
+i += sizeof(float##N), pg >>= sizeof(float##N); \
+} while (i & 15);   \
+} while (i < opr_sz);   \
+}
+
+DO_FMLA(sve_fmla_zpzzz_h, 16, H1_2, 0, 0)
+DO_FMLA(sve_fmla_zpzzz_s, 32, H1_4, 0, 0)
+DO_FMLA(sve_fmla_zpzzz_d, 64, , 0, 0)
+
+DO_FMLA(sve_fmls_zpzzz_h, 16, H1_2, 0, 1)
+DO_FMLA(sve_fmls_zpzzz_s, 32, H1_4, 0, 1)
+DO_FMLA(sve_fmls_zpzzz_d, 64, , 0, 1)
+
+DO_FMLA(sve_fnmla_zpzzz_h, 16, H1_2, 1, 0)
+DO_FMLA(sve_fnmla_zpzzz_s, 32, H1_4, 1, 0)
+DO_FMLA(sve_fnmla_zpzzz_d, 64, , 1, 0)
+
+DO_FMLA(sve_fnmls_zpzzz_h, 16, H1_2, 1, 1)
+DO_FMLA(sve_fnmls_zpzzz_s, 32, H1_4, 1, 1)
+DO_FMLA(sve_fnmls_zpzzz_d, 64, , 1,

[Qemu-devel] [PATCH v2 60/67] target/arm: Implement SVE FP Fast Reduction Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 35 ++
 target/arm/sve_helper.c| 61 ++
 target/arm/translate-sve.c | 55 +
 target/arm/sve.decode  |  8 ++
 4 files changed, 159 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 7ada12687b..c07b2245ba 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -725,6 +725,41 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_faddv_h, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_faddv_s, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_faddv_d, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_h, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_s, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_d, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fminnmv_h, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminnmv_s, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminnmv_d, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fmaxv_h, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxv_s, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxv_d, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fminv_h, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminv_s, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminv_d, TCG_CALL_NO_RWG,
+   i64, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG,
i64, i64, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 9378c8f0b2..29deefcd86 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2832,6 +2832,67 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
 return predtest_ones(d, oprsz, esz_mask);
 }
 
+/* Recursive reduction on a function;
+ * C.f. the ARM ARM function ReducePredicated.
+ *
+ * While it would be possible to write this without the DATA temporary,
+ * it is much simpler to process the predicate register this way.
+ * The recursion is bounded to depth 7 (128 fp16 elements), so there's
+ * little to gain with a more complex non-recursive form.
+ */
+#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
+static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+{ \
+if (n == 1) { \
+return *data; \
+} else {  \
+uintptr_t half = n / 2;   \
+TYPE lo = NAME##_reduce(data, status, half);  \
+TYPE hi = NAME##_reduce(data + half, status, half);   \
+return TYPE##_##FUNC(lo, hi, status); \
+} \
+} \
+uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)\
+{ \
+uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);  \
+TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];   \
+for (i = 0; i < oprsz; ) {\
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));   \
+do {  \
+TYPE nn = *(TYPE *)(vn + H(i));   \
+*(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);  \
+i += sizeof(TYPE), pg >>= sizeof(TYPE);   \
+} while (i & 15); \
+} \
+for (; i < maxsz; i += sizeof(TYPE)) {\
+*(TYPE *)((void *)data + i) = IDENT;  \
+} \
+return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
+}
+
+DO_REDUC

[Qemu-devel] [PATCH v9 01/14] hw/arm/smmu-common: smmu base device and datatypes

2018-02-17 Thread Eric Auger
This patch introduces the SMMU base device and class for the ARM
SMMU. Devices for specific versions will be derived from this
base device.

We also introduce some important datatypes.
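
As a rough QOM illustration of "derived from this base device", a
version-specific model would subclass TYPE_ARM_SMMU along these lines
(minimal sketch; the SMMUv3 names are assumptions here and only show up
in later patches of the series):

static const TypeInfo smmuv3_type_info = {
    .name          = TYPE_ARM_SMMUV3,      /* assumed derived type name */
    .parent        = TYPE_ARM_SMMU,        /* abstract base added by this patch */
    .instance_size = sizeof(SMMUv3State),  /* assumed derived state struct */
    .class_init    = smmuv3_class_init,
};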

Signed-off-by: Eric Auger 
Signed-off-by: Prem Mallappa 

---
v8 -> v9:
- remove page walk callback type from this patch (vhost related)
- add a new hash table for caching configuration data
- add reset function
- add asid

v7 -> v8:
- add bus_num property
- add primary-bus property
- add realize and remove instance_init
- rename TYPE and related macros to match the naming convention used
  for the GIC
- add SMMUPageTableWalkEventInfo
- tt[2] in translation config

v3 -> v4:
- added smmu_find_as_from_bus_num
- SMMU_PCI_BUS_MAX and SMMU_PCI_DEVFN_MAX in smmu-common header
- new fields in SMMUState:
  - iommu_ops, smmu_as_by_busptr, smmu_as_by_bus_num
- add aa64[] field in SMMUTransCfg

v3:
- moved the base code in a separate patch to ease the review.
- clearer separation between base class and smmuv3 class
- translate_* only implemented as class methods

Conflicts:
default-configs/aarch64-softmmu.mak
---
 default-configs/aarch64-softmmu.mak |   1 +
 hw/arm/Makefile.objs|   1 +
 hw/arm/smmu-common.c|  80 +++
 include/hw/arm/smmu-common.h| 124 
 4 files changed, 206 insertions(+)
 create mode 100644 hw/arm/smmu-common.c
 create mode 100644 include/hw/arm/smmu-common.h

diff --git a/default-configs/aarch64-softmmu.mak b/default-configs/aarch64-softmmu.mak
index 9ddccf8..6f790f0 100644
--- a/default-configs/aarch64-softmmu.mak
+++ b/default-configs/aarch64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_DDC=y
 CONFIG_DPCD=y
 CONFIG_XLNX_ZYNQMP=y
 CONFIG_XLNX_ZYNQMP_ARM=y
+CONFIG_ARM_SMMUV3=y
diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
index 1c896ba..c84c5ac 100644
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -20,3 +20,4 @@ obj-$(CONFIG_FSL_IMX6) += fsl-imx6.o sabrelite.o
 obj-$(CONFIG_ASPEED_SOC) += aspeed_soc.o aspeed.o
 obj-$(CONFIG_MPS2) += mps2.o
 obj-$(CONFIG_MSF2) += msf2-soc.o msf2-som.o
+obj-$(CONFIG_ARM_SMMUV3) += smmu-common.o
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
new file mode 100644
index 000..86a5aab
--- /dev/null
+++ b/hw/arm/smmu-common.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2014-2016 Broadcom Corporation
+ * Copyright (c) 2017 Red Hat, Inc.
+ * Written by Prem Mallappa, Eric Auger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Author: Prem Mallappa 
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/sysemu.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+#include "exec/target_page.h"
+#include "qom/cpu.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+
+#include "qemu/error-report.h"
+#include "hw/arm/smmu-common.h"
+
+static void smmu_base_realize(DeviceState *dev, Error **errp)
+{
+SMMUState *s = ARM_SMMU(dev);
+
+s->configs = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+s->iotlb = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+}
+
+static void smmu_base_reset(DeviceState *dev)
+{
+SMMUState *s = ARM_SMMU(dev);
+
+g_hash_table_remove_all(s->configs);
+g_hash_table_remove_all(s->iotlb);
+}
+
+static Property smmu_dev_properties[] = {
+DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0),
+DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus, "PCI", PCIBus *),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void smmu_base_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+SMMUBaseClass *sbc = ARM_SMMU_CLASS(klass);
+
+dc->props = smmu_dev_properties;
+sbc->parent_realize = dc->realize;
+dc->realize = smmu_base_realize;
+dc->reset = smmu_base_reset;
+}
+
+static const TypeInfo smmu_base_info = {
+.name  = TYPE_ARM_SMMU,
+.parent= TYPE_SYS_BUS_DEVICE,
+.instance_size = sizeof(SMMUState),
+.class_data= NULL,
+.class_size= sizeof(SMMUBaseClass),
+.class_init= smmu_base_class_init,
+.abstract  = true,
+};
+
+static void smmu_base_register_types(void)
+{
+type_register_static(&smmu_base_info);
+}
+
+type_init(smmu_base_register_types)
+
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
new file mode 100644
index 000..8a9d931
--- /dev/null
+++ b/include/hw/arm/smmu-common.h
@@ -0,0 +1,124 @@
+/*
+ * ARM SMMU Support
+ *
+ * Copyright (C) 2015-2016 Broadcom Corporation
+ * Copyright (c) 2017 Red Hat, Inc.
+ * Written by Prem Mallappa, Eric Auger
+ *
+ * This program is free soft

[Qemu-devel] [PATCH v9 03/14] hw/arm/smmu-common: VMSAv8-64 page table walk

2018-02-17 Thread Eric Auger
This patch implements the page table walk for VMSAv8-64.
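
As a concrete example of the arithmetic involved: with a 4KB granule
(granule_sz = 12) a level-1 block descriptor maps 2^30 bytes (1GB) and a
level-2 block maps 2^21 bytes (2MB); with a 16KB granule a level-2 block
maps 2^25 bytes (32MB), and with a 64KB granule 2^29 bytes (512MB). A
level-3 descriptor always maps a single page of 2^granule_sz bytes.
This is what get_block_pte_address() below computes.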

Signed-off-by: Eric Auger 

---
v8 -> v9:
- remove guest error log on PTE fetch fault
- rename  trace functions
- fix smmu_page_walk_level_res_invalid_pte last arg
- fix PTE_ADDRESS
- turn functions into macros
- make sure to return the actual pte access permission
  into tlbe->perm
- change proto of smmu_ptw*

v7 -> v8:
- rework get_pte
- use LOG_LEVEL_ERROR
- remove error checking in get_block_pte_address
- page table walk simplified (no VFIO replay anymore)
- handle PTW error events
- use dma_memory_read

v6 -> v7:
- fix wrong error handling in walk_page_table
- check perm in smmu_translate

v5 -> v6:
- use IOMMUMemoryRegion
- remove initial_lookup_level()
- fix block replay

v4 -> v5:
- add initial level in translation config
- implement block pte
- rename must_translate into nofail
- introduce call_entry_hook
- small changes to dynamic traces
- smmu_page_walk code moved from smmuv3.c to this file
- remove smmu_translate*

v3 -> v4:
- reworked page table walk to prepare for VFIO integration
  (capability to scan a range of IOVA). Same function is used
  for translate for a single iova. This is largely inspired
  from intel_iommu.c
- as the translate function was not straightforward to me,
  I tried to stick more closely to the VMSA spec.
- remove support of nested stage (kernel driver does not
  support it anyway)
- use error_report and trace events
- add aa64[] field in SMMUTransCfg
---
 hw/arm/smmu-common.c | 232 +++
 hw/arm/smmu-internal.h   |  96 ++
 hw/arm/trace-events  |  10 ++
 include/hw/arm/smmu-common.h |   6 ++
 4 files changed, 344 insertions(+)
 create mode 100644 hw/arm/smmu-internal.h

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index d0516dc..24cc4ba 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -27,6 +27,238 @@
 
 #include "qemu/error-report.h"
 #include "hw/arm/smmu-common.h"
+#include "smmu-internal.h"
+
+/* VMSAv8-64 Translation */
+
+/**
+ * get_pte - Get the content of a page table entry located at
+ * @base_addr[@index]
+ */
+static int get_pte(dma_addr_t baseaddr, uint32_t index, uint64_t *pte,
+   SMMUPTWEventInfo *info)
+{
+int ret;
+dma_addr_t addr = baseaddr + index * sizeof(*pte);
+
+ret = dma_memory_read(&address_space_memory, addr,
+  (uint8_t *)pte, sizeof(*pte));
+
+if (ret != MEMTX_OK) {
+info->type = SMMU_PTW_ERR_WALK_EABT;
+info->addr = addr;
+return -EINVAL;
+}
+trace_smmu_get_pte(baseaddr, index, addr, *pte);
+return 0;
+}
+
+/* VMSAv8-64 Translation Table Format Descriptor Decoding */
+
+/**
+ * get_page_pte_address - returns the L3 descriptor output address,
+ * ie. the page frame
+ * ARM ARM spec: Figure D4-17 VMSAv8-64 level 3 descriptor format
+ */
+static inline hwaddr get_page_pte_address(uint64_t pte, int granule_sz)
+{
+return PTE_ADDRESS(pte, granule_sz);
+}
+
+/**
+ * get_table_pte_address - return table descriptor output address,
+ * ie. address of next level table
+ * ARM ARM Figure D4-16 VMSAv8-64 level0, level1, and level 2 descriptor formats
+ */
+static inline hwaddr get_table_pte_address(uint64_t pte, int granule_sz)
+{
+return PTE_ADDRESS(pte, granule_sz);
+}
+
+/**
+ * get_block_pte_address - return block descriptor output address and block size
+ * ARM ARM Figure D4-16 VMSAv8-64 level0, level1, and level 2 descriptor formats
+ */
+static hwaddr get_block_pte_address(uint64_t pte, int level, int granule_sz,
+uint64_t *bsz)
+{
+int n = 0;
+
+switch (granule_sz) {
+case 12:
+if (level == 1) {
+n = 30;
+} else if (level == 2) {
+n = 21;
+}
+break;
+case 14:
+if (level == 2) {
+n = 25;
+}
+break;
+case 16:
+if (level == 2) {
+n = 29;
+}
+break;
+}
+if (!n) {
+error_setg(&error_fatal,
+   "wrong granule/level combination (%d/%d)",
+   granule_sz, level);
+}
+*bsz = 1 << n;
+return PTE_ADDRESS(pte, n);
+}
+
+static inline bool check_perm(int access_attrs, int mem_attrs)
+{
+if (((access_attrs & IOMMU_RO) && !(mem_attrs & IOMMU_RO)) ||
+((access_attrs & IOMMU_WO) && !(mem_attrs & IOMMU_WO))) {
+return false;
+}
+return true;
+}
+
+SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova)
+{
+if (!extract64(iova, 64 - cfg->tt[0].tsz, cfg->tt[0].tsz - cfg->tbi)) {
+return &cfg->tt[0];
+}
+return &cfg->tt[1];
+}
+
+/**
+ * smmu_ptw_64 - VMSAv8-64 Walk of the page tables for a given IOVA
+ * @cfg: translation config
+ * @iova: iova to translate
+ * @perm: access type
+ * @tlbe: IOMMUTLBEntry (out)
+ * @info: handle to an error info
+ *
+ * Return 0 on success, < 0 on error. In case of error, @info is

[Qemu-devel] [PATCH v2 50/67] target/arm: Implement SVE Floating Point Accumulating Reduction Group

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h|  7 ++
 target/arm/sve_helper.c| 56 ++
 target/arm/translate-sve.c | 42 ++
 target/arm/sve.decode  |  5 +
 4 files changed, 110 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index a95f077c7f..c4502256d5 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -720,6 +720,13 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG,
+   i64, i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
+   i64, i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fadda_d, TCG_CALL_NO_RWG,
+   i64, i64, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 6622275b44..0e2b3091b0 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2789,6 +2789,62 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
 return predtest_ones(d, oprsz, esz_mask);
 }
 
+uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+intptr_t i = 0, opr_sz = simd_oprsz(desc);
+float16 result = nn;
+
+do {
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+do {
+if (pg & 1) {
+float16 mm = *(float16 *)(vm + H1_2(i));
+result = float16_add(result, mm, status);
+}
+i += sizeof(float16), pg >>= sizeof(float16);
+} while (i & 15);
+} while (i < opr_sz);
+
+return result;
+}
+
+uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+intptr_t i = 0, opr_sz = simd_oprsz(desc);
+float32 result = nn;
+
+do {
+uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+do {
+if (pg & 1) {
+float32 mm = *(float32 *)(vm + H1_2(i));
+result = float32_add(result, mm, status);
+}
+i += sizeof(float32), pg >>= sizeof(float32);
+} while (i & 15);
+} while (i < opr_sz);
+
+return result;
+}
+
+uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
+uint64_t *m = vm;
+uint8_t *pg = vg;
+
+for (i = 0; i < opr_sz; i++) {
+if (pg[H1(i)] & 1) {
+nn = float64_add(nn, m[i], status);
+}
+}
+
+return nn;
+}
+
 /* Fully general three-operand expander, controlled by a predicate,
  * With the extra float_status parameter.
  */
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3124368fb5..32f0340738 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3120,6 +3120,48 @@ DO_ZZI(UMIN, umin)
 
 #undef DO_ZZI
 
+/*
+ *** SVE Floating Point Accumulating Reduction Group
+ */
+
+static void trans_FADDA(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+{
+typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr,
+  TCGv_ptr, TCGv_ptr, TCGv_i32);
+static fadda_fn * const fns[3] = {
+gen_helper_sve_fadda_h,
+gen_helper_sve_fadda_s,
+gen_helper_sve_fadda_d,
+};
+unsigned vsz = vec_full_reg_size(s);
+TCGv_ptr t_rm, t_pg, t_fpst;
+TCGv_i64 t_val;
+TCGv_i32 t_desc;
+
+if (a->esz == 0) {
+unallocated_encoding(s);
+return;
+}
+
+t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz);
+t_rm = tcg_temp_new_ptr();
+t_pg = tcg_temp_new_ptr();
+tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm));
+tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+t_fpst = get_fpstatus_ptr(a->esz == MO_16);
+t_desc = tcg_const_i32(simd_desc(vsz, vsz, 0));
+
+fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc);
+
+tcg_temp_free_i32(t_desc);
+tcg_temp_free_ptr(t_fpst);
+tcg_temp_free_ptr(t_pg);
+tcg_temp_free_ptr(t_rm);
+
+write_fp_dreg(s, a->rd, t_val);
+tcg_temp_free_i64(t_val);
+}
+
 /*
  *** SVE Floating Point Arithmetic - Unpredicated Group
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 817833f96e..95a290aed0 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -684,6 +684,11 @@ UMIN_zzi   00100101 .. 101 011 110  .  @rdn_i8u
 # SVE integer multiply immediate (unpredicated)
 MUL_zzi00100101 .. 110 000 110  .  @rdn_i8s
 
+### SVE

[Qemu-devel] [PATCH v2 65/67] target/arm: Implement SVE floating-point convert to integer

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 30 
 target/arm/sve_helper.c| 16 +++
 target/arm/translate-sve.c | 70 ++
 target/arm/sve.decode  | 16 +++
 4 files changed, 132 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index bac4bfdc60..0f5fea9045 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -955,6 +955,36 @@ DEF_HELPER_FLAGS_5(sve_fcvt_hd, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_fcvt_sd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hs, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_ss, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_ds, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_sd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_dd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hh, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hs, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_ss, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_ds, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_sd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_dd, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 9db01ac2f2..09f5c77254 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3184,6 +3184,22 @@ DO_ZPZ_FP_D(sve_fcvt_hd, uint64_t, float16_to_float64_ieee)
 DO_ZPZ_FP_D(sve_fcvt_ds, uint64_t, float64_to_float32)
 DO_ZPZ_FP_D(sve_fcvt_sd, uint64_t, float32_to_float64)
 
+DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, float16_to_int16_round_to_zero)
+DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, float16_to_int32_round_to_zero)
+DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, float32_to_int32_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzs_hd, uint64_t, float16_to_int64_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzs_sd, uint64_t, float32_to_int64_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzs_ds, uint64_t, float64_to_int32_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzs_dd, uint64_t, float64_to_int64_round_to_zero)
+
+DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, float16_to_uint16_round_to_zero)
+DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, float16_to_uint32_round_to_zero)
+DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, float32_to_uint32_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzu_hd, uint64_t, float16_to_uint64_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzu_sd, uint64_t, float32_to_uint64_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzu_ds, uint64_t, float64_to_uint32_round_to_zero)
+DO_ZPZ_FP_D(sve_fcvtzu_dd, uint64_t, float64_to_uint64_round_to_zero)
+
 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 361d545965..bc865dfd15 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3681,6 +3681,76 @@ static void trans_FCVT_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_sd);
 }
 
+static void trans_FCVTZS_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hh);
+}
+
+static void trans_FCVTZU_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hh);
+}
+
+static void trans_FCVTZS_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hs);
+}
+
+static void trans_FCVTZU_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hs);
+}
+
+static void trans_FCVTZS_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hd);
+}
+
+static void trans_FCVTZU_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_z

[Qemu-devel] [PATCH v9 10/14] hw/arm/smmuv3: Abort on vfio or vhost case

2018-02-17 Thread Eric Auger
At the moment, the SMMUv3 does not support notification on
TLB invalidation. So let's abort as soon as such a notifier gets
enabled.
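
For context, the flag change is triggered when a consumer such as vfio
or vhost registers an IOMMU notifier on the region; something along
these lines (sketch of the generic memory API as of this series,
my_notify_cb and mr being placeholders, not part of this patch) is what
would hit the new error path:

IOMMUNotifier n;

iommu_notifier_init(&n, my_notify_cb, IOMMU_NOTIFIER_ALL, 0, HWADDR_MAX);
memory_region_register_iommu_notifier(mr, &n);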

Signed-off-by: Eric Auger 
---
 hw/arm/smmuv3.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 384393f..5efe933 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1074,12 +1074,23 @@ static void smmuv3_class_init(ObjectClass *klass, void *data)
 dc->realize = smmu_realize;
 }
 
+static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
+   IOMMUNotifierFlag old,
+   IOMMUNotifierFlag new)
+{
+if (old == IOMMU_NOTIFIER_NONE) {
+error_setg(&error_fatal,
+   "SMMUV3: vhost and vfio notifiers not yet supported");
+}
+}
+
 static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
   void *data)
 {
 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
 
 imrc->translate = smmuv3_translate;
+imrc->notify_flag_changed = smmuv3_notify_flag_changed;
 }
 
 static const TypeInfo smmuv3_type_info = {
-- 
2.5.5




[Qemu-devel] [PATCH v2 55/67] target/arm: Implement SVE gather loads

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 67 
 target/arm/sve_helper.c| 75 +++
 target/arm/translate-sve.c | 97 ++
 target/arm/sve.decode  | 53 +
 4 files changed, 292 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index b5c093f2fd..3cb7ab9ef2 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -919,6 +919,73 @@ DEF_HELPER_FLAGS_4(sve_st1hd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 
 DEF_HELPER_FLAGS_4(sve_st1sd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
 
+DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhsu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldssu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbss_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhss_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbsu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhsu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldssu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbss_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhss_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zsu, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zss, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zd, TCG_CALL_NO_WG,
+   void, env, ptr, ptr, ptr, tl, i32)
+
 DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
 DEF_HELPER_FLAGS_6(sve_sths_zsu, TCG_CALL_NO_WG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 07b3d285f2..4edd3d4367 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3546,6 +3546,81 @@ void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
 }
 }
 
+/* Loads with a vector index.  */
+
+#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN)\
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,   \
+  target_ulong base, uint32_t desc) \
+{   \
+intptr_t i, oprsz = simd_oprsz(desc) / 8;   \
+unsigned scale = simd_data(desc);   \
+uintptr_t ra = GETPC(); \
+uint32_t *d = vd; TYPEI *m = vm; uint8_t *pg = vg;  

[Qemu-devel] [PATCH v2 66/67] target/arm: Implement SVE floating-point round to integral value

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 
 target/arm/sve_helper.c|  8 +
 target/arm/translate-sve.c | 80 ++
 target/arm/sve.decode  |  9 ++
 4 files changed, 111 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0f5fea9045..749bab0b38 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -985,6 +985,20 @@ DEF_HELPER_FLAGS_5(sve_fcvtzu_sd, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve_fcvtzu_dd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve_frint_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frint_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frint_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_frintx_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frintx_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frintx_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 09f5c77254..7950710be7 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3200,6 +3200,14 @@ DO_ZPZ_FP_D(sve_fcvtzu_sd, uint64_t, float32_to_uint64_round_to_zero)
 DO_ZPZ_FP_D(sve_fcvtzu_ds, uint64_t, float64_to_uint32_round_to_zero)
 DO_ZPZ_FP_D(sve_fcvtzu_dd, uint64_t, float64_to_uint64_round_to_zero)
 
+DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
+DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
+DO_ZPZ_FP_D(sve_frint_d, uint64_t, helper_rintd)
+
+DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
+DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
+DO_ZPZ_FP_D(sve_frintx_d, uint64_t, float64_round_to_int)
+
 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index bc865dfd15..5f1c4984b8 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3751,6 +3751,86 @@ static void trans_FCVTZU_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_dd);
 }
 
+static gen_helper_gvec_3_ptr * const frint_fns[3] = {
+gen_helper_sve_frint_h,
+gen_helper_sve_frint_s,
+gen_helper_sve_frint_d
+};
+
+static void trans_FRINTI(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16,
+   frint_fns[a->esz - 1]);
+}
+}
+
+static void trans_FRINTX(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+static gen_helper_gvec_3_ptr * const fns[3] = {
+gen_helper_sve_frintx_h,
+gen_helper_sve_frintx_s,
+gen_helper_sve_frintx_d
+};
+if (a->esz == 0) {
+unallocated_encoding(s);
+} else {
+do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+}
+
+static void do_frint_mode(DisasContext *s, arg_rpr_esz *a, int mode)
+{
+unsigned vsz = vec_full_reg_size(s);
+TCGv_i32 tmode;
+TCGv_ptr status;
+
+if (a->esz == 0) {
+unallocated_encoding(s);
+return;
+}
+
+tmode = tcg_const_i32(mode);
+status = get_fpstatus_ptr(a->esz == MO_16);
+gen_helper_set_rmode(tmode, tmode, status);
+
+tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->rn),
+   pred_full_reg_offset(s, a->pg),
+   status, vsz, vsz, 0, frint_fns[a->esz - 1]);
+
+gen_helper_set_rmode(tmode, tmode, status);
+tcg_temp_free_i32(tmode);
+tcg_temp_free_ptr(status);
+}
+
+static void trans_FRINTN(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_frint_mode(s, a, float_round_nearest_even);
+}
+
+static void trans_FRINTP(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_frint_mode(s, a, float_round_up);
+}
+
+static void trans_FRINTM(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_frint_mode(s, a, float_round_down);
+}
+
+static void trans_FRINTZ(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_frint_mode(s, a, float_round_to_zero);
+}
+
+static void trans_FRINTA(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+do_frint_mode(s, a, float_round_ties_away);
+}
+
 static void trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
 {
 do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh);
diff -
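
A side note on do_frint_mode() above: helper_set_rmode() hands the previous
rounding mode back in the same register it is passed, which is why the second
gen_helper_set_rmode() call restores the caller's mode. A minimal standalone
sketch of that save/set/restore idiom, written against the standard C <fenv.h>
interface purely as an analogy (this is not QEMU code; round_with_mode is an
illustrative name):

#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* Round x to an integral value under an explicit rounding mode, saving and
 * restoring the caller's mode around the operation -- the same shape that
 * do_frint_mode() emits around the gvec helper call.
 * Strictly, #pragma STDC FENV_ACCESS ON should be in effect. */
static double round_with_mode(double x, int mode)
{
    int old = fegetround();     /* remember the current rounding mode */
    fesetround(mode);
    double r = nearbyint(x);    /* rounds according to the current mode */
    fesetround(old);            /* restore before returning */
    return r;
}

int main(void)
{
    printf("%.1f %.1f %.1f\n",
           round_with_mode(2.5, FE_TONEAREST),  /* FRINTN-like: 2.0 */
           round_with_mode(2.5, FE_UPWARD),     /* FRINTP-like: 3.0 */
           round_with_mode(2.5, FE_DOWNWARD));  /* FRINTM-like: 2.0 */
    return 0;
}

FRINTA (round to nearest, ties away from zero) has no <fenv.h> counterpart,
which is one reason the translator drives softfloat's float_round_ties_away
mode directly instead.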

[Qemu-devel] [PATCH v9 08/14] hw/arm/smmuv3: Event queue recording helper

2018-02-17 Thread Eric Auger
Let's introduce a helper function that records an event in the
event queue.

Signed-off-by: Eric Auger 

---

v8 -> v9:
- add SMMU_EVENT_STRING

v7 -> v8:
- use dma_addr_t instead of hwaddr in smmuv3_record_event()
- introduce struct SMMUEventInfo
- add event_stringify + helpers for all fields
---
 hw/arm/smmuv3-internal.h | 140 ++-
 hw/arm/smmuv3.c  |  91 +-
 hw/arm/trace-events  |   1 +
 3 files changed, 229 insertions(+), 3 deletions(-)

diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index 5af97ae..3929f69 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -226,8 +226,6 @@ static inline void smmu_write_cmdq_err(SMMUv3State *s, uint32_t err_type)
 s->cmdq.cons = FIELD_DP32(s->cmdq.cons, CMDQ_CONS, ERR, err_type);
 }
 
-void smmuv3_write_eventq(SMMUv3State *s, Evt *evt);
-
 /* Commands */
 
 enum {
@@ -326,4 +324,142 @@ enum { /* Command completion notification */
 addr; \
 })
 
+/* Events */
+
+typedef enum SMMUEventType {
+SMMU_EVT_OK = 0x00,
+SMMU_EVT_F_UUT  = 0x01,
+SMMU_EVT_C_BAD_STREAMID = 0x02,
+SMMU_EVT_F_STE_FETCH= 0x03,
+SMMU_EVT_C_BAD_STE  = 0x04,
+SMMU_EVT_F_BAD_ATS_TREQ = 0x05,
+SMMU_EVT_F_STREAM_DISABLED  = 0x06,
+SMMU_EVT_F_TRANS_FORBIDDEN  = 0x07,
+SMMU_EVT_C_BAD_SUBSTREAMID  = 0x08,
+SMMU_EVT_F_CD_FETCH = 0x09,
+SMMU_EVT_C_BAD_CD   = 0x0a,
+SMMU_EVT_F_WALK_EABT= 0x0b,
+SMMU_EVT_F_TRANSLATION  = 0x10,
+SMMU_EVT_F_ADDR_SIZE= 0x11,
+SMMU_EVT_F_ACCESS   = 0x12,
+SMMU_EVT_F_PERMISSION   = 0x13,
+SMMU_EVT_F_TLB_CONFLICT = 0x20,
+SMMU_EVT_F_CFG_CONFLICT = 0x21,
+SMMU_EVT_E_PAGE_REQ = 0x24,
+} SMMUEventType;
+
+static const char *event_stringify[] = {
+[SMMU_EVT_OK]   = "SMMU_EVT_OK",
+[SMMU_EVT_F_UUT]= "SMMU_EVT_F_UUT",
+[SMMU_EVT_C_BAD_STREAMID]   = "SMMU_EVT_C_BAD_STREAMID",
+[SMMU_EVT_F_STE_FETCH]  = "SMMU_EVT_F_STE_FETCH",
+[SMMU_EVT_C_BAD_STE]= "SMMU_EVT_C_BAD_STE",
+[SMMU_EVT_F_BAD_ATS_TREQ]   = "SMMU_EVT_F_BAD_ATS_TREQ",
+[SMMU_EVT_F_STREAM_DISABLED]= "SMMU_EVT_F_STREAM_DISABLED",
+[SMMU_EVT_F_TRANS_FORBIDDEN]= "SMMU_EVT_F_TRANS_FORBIDDEN",
+[SMMU_EVT_C_BAD_SUBSTREAMID]= "SMMU_EVT_C_BAD_SUBSTREAMID",
+[SMMU_EVT_F_CD_FETCH]   = "SMMU_EVT_F_CD_FETCH",
+[SMMU_EVT_C_BAD_CD] = "SMMU_EVT_C_BAD_CD",
+[SMMU_EVT_F_WALK_EABT]  = "SMMU_EVT_F_WALK_EABT",
+[SMMU_EVT_F_TRANSLATION]= "SMMU_EVT_F_TRANSLATION",
+[SMMU_EVT_F_ADDR_SIZE]  = "SMMU_EVT_F_ADDR_SIZE",
+[SMMU_EVT_F_ACCESS] = "SMMU_EVT_F_ACCESS",
+[SMMU_EVT_F_PERMISSION] = "SMMU_EVT_F_PERMISSION",
+[SMMU_EVT_F_TLB_CONFLICT]   = "SMMU_EVT_F_TLB_CONFLICT",
+[SMMU_EVT_F_CFG_CONFLICT]   = "SMMU_EVT_F_CFG_CONFLICT",
+[SMMU_EVT_E_PAGE_REQ]   = "SMMU_EVT_E_PAGE_REQ",
+};
+
+#define SMMU_EVENT_STRING(event) ( \
+(event < ARRAY_SIZE(event_stringify)) ? event_stringify[event] : "UNKNOWN" \
+)
+
+typedef struct SMMUEventInfo {
+SMMUEventType type;
+uint32_t sid;
+bool recorded;
+bool record_trans_faults;
+union {
+struct {
+uint32_t ssid;
+bool ssv;
+dma_addr_t addr;
+bool rnw;
+bool pnu;
+bool ind;
+   } f_uut;
+   struct ssid_info {
+uint32_t ssid;
+bool ssv;
+   } c_bad_streamid;
+   struct ssid_addr_info {
+uint32_t ssid;
+bool ssv;
+dma_addr_t addr;
+   } f_ste_fetch;
+   struct ssid_info c_bad_ste;
+   struct {
+dma_addr_t addr;
+bool rnw;
+   } f_transl_forbidden;
+   struct {
+uint32_t ssid;
+   } c_bad_substream;
+   struct ssid_addr_info f_cd_fetch;
+   struct ssid_info c_bad_cd;
+   struct full_info {
+bool stall;
+uint16_t stag;
+uint32_t ssid;
+bool ssv;
+bool s2;
+dma_addr_t addr;
+bool rnw;
+bool pnu;
+bool ind;
+uint8_t class;
+dma_addr_t addr2;
+   } f_walk_eabt;
+   struct full_info f_translation;
+   struct full_info f_addr_size;
+   struct full_info f_access;
+   struct full_info f_permission;
+   struct ssid_info f_cfg_conflict;
+   /**
+* not supported yet:
+* F_BAD_ATS_TREQ
+* F_TLB_CONFLICT
+* E_PAGE_REQUEST
+* IMPDEF_EVENTn
+*/
+} u;
+}
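
For what it's worth, the event_stringify[] / SMMU_EVENT_STRING pair above is
the usual bounds-checked designated-initializer lookup; note that the macro
only checks the upper bound, so event codes falling into gaps of the enum
(e.g. 0x0c..0x0f) would still yield a NULL pointer. A self-contained sketch of
the same idiom with an extra NULL guard (all names below are illustrative and
not part of the patch):

#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

typedef enum DemoEvent {
    DEMO_EVT_OK      = 0x00,
    DEMO_EVT_F_UUT   = 0x01,
    DEMO_EVT_F_WALK  = 0x0b,   /* a gap follows, just like the SMMU enum */
    DEMO_EVT_F_TRANS = 0x10,
} DemoEvent;

static const char *demo_event_str[] = {
    [DEMO_EVT_OK]      = "DEMO_EVT_OK",
    [DEMO_EVT_F_UUT]   = "DEMO_EVT_F_UUT",
    [DEMO_EVT_F_WALK]  = "DEMO_EVT_F_WALK",
    [DEMO_EVT_F_TRANS] = "DEMO_EVT_F_TRANS",
};

/* Bounds-checked lookup; unlike the macro in the patch it also guards
 * against NULL entries left by gaps in the numbering. */
static const char *demo_event_string(unsigned event)
{
    if (event < ARRAY_SIZE(demo_event_str) && demo_event_str[event]) {
        return demo_event_str[event];
    }
    return "UNKNOWN";
}

int main(void)
{
    printf("%s\n", demo_event_string(DEMO_EVT_F_WALK));  /* DEMO_EVT_F_WALK */
    printf("%s\n", demo_event_string(0x0c));             /* UNKNOWN (gap) */
    printf("%s\n", demo_event_string(0x42));             /* UNKNOWN (range) */
    return 0;
}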

[Qemu-devel] [PATCH 1/2] hw/net: Add support for Intel pch_gbe ethernet

2018-02-17 Thread Paul Burton
This patch introduces support for emulating the ethernet controller
found in the Intel EG20T Platform Controller Hub, referred to as pch_gbe
for consistency with both Linux & U-Boot.

Documentation for the hardware can be found here:

  
https://www.intel.com/content/www/us/en/intelligent-systems/queens-bay/platform-controller-hub-eg20t-datasheet.html

The device is used on MIPS Boston development boards as well as on the
Intel Crown Bay platform, including devices such as the Minnowboard V1.

Enough functionality is implemented for Linux to make use of the device;
this has been tested using Linux v4.16-rc1.

Signed-off-by: Paul Burton 
Cc: Aurelien Jarno 
Cc: Yongbok Kim 
---

 hw/net/Makefile.objs |   1 +
 hw/net/pch_gbe.c | 766 +++
 2 files changed, 767 insertions(+)
 create mode 100644 hw/net/pch_gbe.c

diff --git a/hw/net/Makefile.objs b/hw/net/Makefile.objs
index ab22968641..08706d9a96 100644
--- a/hw/net/Makefile.objs
+++ b/hw/net/Makefile.objs
@@ -12,6 +12,7 @@ common-obj-$(CONFIG_E1000E_PCI) += e1000e.o e1000e_core.o e1000x_common.o
 common-obj-$(CONFIG_RTL8139_PCI) += rtl8139.o
 common-obj-$(CONFIG_VMXNET3_PCI) += net_tx_pkt.o net_rx_pkt.o
 common-obj-$(CONFIG_VMXNET3_PCI) += vmxnet3.o
+common-obj-$(CONFIG_PCH_GBE_PCI) += pch_gbe.o
 
 common-obj-$(CONFIG_SMC91C111) += smc91c111.o
 common-obj-$(CONFIG_LAN9118) += lan9118.o
diff --git a/hw/net/pch_gbe.c b/hw/net/pch_gbe.c
new file mode 100644
index 00..be9a9f5916
--- /dev/null
+++ b/hw/net/pch_gbe.c
@@ -0,0 +1,766 @@
+#include "qemu/osdep.h"
+#include "hw/hw.h"
+#include "hw/net/mii.h"
+#include "hw/pci/pci.h"
+#include "net/checksum.h"
+#include "net/eth.h"
+#include "net/net.h"
+#include "qemu/bitops.h"
+#include "qemu/log.h"
+
+#define TYPE_PCH_GBE"pch_gbe"
+#define PCH_GBE(obj)OBJECT_CHECK(PCHGBEState, (obj), TYPE_PCH_GBE)
+
+#define PCH_GBE_INTR_RX_DMA_CMPLT   BIT(0)
+#define PCH_GBE_INTR_RX_VALID   BIT(1)
+#define PCH_GBE_INTR_RX_FRAME_ERR   BIT(2)
+#define PCH_GBE_INTR_RX_FIFO_ERRBIT(3)
+#define PCH_GBE_INTR_RX_DMA_ERR BIT(4)
+#define PCH_GBE_INTR_RX_DSC_EMP BIT(5)
+#define PCH_GBE_INTR_TX_CMPLT   BIT(8)
+#define PCH_GBE_INTR_TX_DMA_CMPLT   BIT(9)
+#define PCH_GBE_INTR_TX_FIFO_ERRBIT(10)
+#define PCH_GBE_INTR_TX_DMA_ERR BIT(11)
+#define PCH_GBE_INTR_PAUSE_CMPLTBIT(12)
+#define PCH_GBE_INTR_MIIM_CMPLT BIT(16)
+#define PCH_GBE_INTR_PHY_INTBIT(20)
+#define PCH_GBE_INTR_WOL_DETBIT(24)
+#define PCH_GBE_INTR_TCPIP_ERR  BIT(28)
+#define PCH_GBE_INTR_ALL (  \
+PCH_GBE_INTR_RX_DMA_CMPLT | \
+PCH_GBE_INTR_RX_VALID | \
+PCH_GBE_INTR_RX_FRAME_ERR | \
+PCH_GBE_INTR_RX_FIFO_ERR |  \
+PCH_GBE_INTR_RX_DMA_ERR |   \
+PCH_GBE_INTR_RX_DSC_EMP |   \
+PCH_GBE_INTR_TX_CMPLT | \
+PCH_GBE_INTR_TX_DMA_CMPLT | \
+PCH_GBE_INTR_TX_FIFO_ERR |  \
+PCH_GBE_INTR_TX_DMA_ERR |   \
+PCH_GBE_INTR_PAUSE_CMPLT |  \
+PCH_GBE_INTR_MIIM_CMPLT |   \
+PCH_GBE_INTR_PHY_INT |  \
+PCH_GBE_INTR_WOL_DET |  \
+PCH_GBE_INTR_TCPIP_ERR)
+
+struct pch_gbe_tx_desc {
+uint32_t addr;
+
+uint32_t len;
+#define PCH_GBE_TX_LENGTH   0x
+
+uint32_t control;
+#define PCH_GBE_TX_CONTROL_EOB  0x3
+#define PCH_GBE_TX_CONTROL_WORDS0xfffc
+#define PCH_GBE_TX_CONTROL_APAD BIT(16)
+#define PCH_GBE_TX_CONTROL_ICRC BIT(17)
+#define PCH_GBE_TX_CONTROL_ITAG BIT(18)
+#define PCH_GBE_TX_CONTROL_ACCOFF   BIT(19)
+
+uint32_t status;
+#define PCH_GBE_TX_STATUS_TSHRT BIT(22)
+#define PCH_GBE_TX_STATUS_TLNG  BIT(23)
+#define PCH_GBE_TX_STATUS_ABT   BIT(28)
+#define PCH_GBE_TX_STATUS_CMPLT BIT(29)
+};
+
+struct pch_gbe_rx_desc {
+uint32_t addr;
+
+uint32_t acc_status;
+
+uint32_t mac_status;
+#define PCH_GBE_RX_MAC_STATUS_EOB   0x3
+#define PCH_GBE_RX_MAC_STATUS_WORDS 0xfffc
+#define PCH_GBE_RX_MAC_STATUS_LENGTH0x
+#define PCH_GBE_RX_MAC_STATUS_TSHRT BIT(19)
+#define PCH_GBE_RX_MAC_STATUS_TLNG  BIT(20)
+
+uint32_t dma_status;
+};
+
+typedef struct {
+/*< private >*/
+PCIDevice parent_obj;
+/*< public >*/
+
+NICState *nic;
+NICConf conf;
+
+bool reset;
+bool phy_reset;
+
+bool link;
+
+uint32_t intr_status;
+uint32_t intr_status_hold;
+uint32_t intr_enable;
+
+uint16_t addr_mask;
+
+bool rx_enable;
+bool rx_dma_enable;
+bool rx_acc_enable;
+bool rx_acc_csum_off;
+uint32_t rx_desc_base;
+uint32_t rx_desc_size;
+uint32_t rx_desc_hard_ptr;
+uint32_t rx_desc_hard_ptr_hold;
+uint32_t rx_desc_soft_ptr;
+
+bool tx_dma_enable;
+bool tx_acc_enable;
+uint32_t tx_desc_base;
+uint32_t tx_desc_size;
+uint32_t t
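
Regarding the intr_status/intr_enable pair in the state structure above: the
split into a latched cause register and a guest-programmed enable mask usually
boils down to a level-triggered "(status & enable) != 0" rule for the
interrupt line. A tiny standalone sketch of that assumed model (the demo_*
names and the write-1-to-clear acknowledge are illustrative assumptions, not
taken from the patch or the EG20T datasheet):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative subset of the cause bits defined in the patch. */
#define DEMO_INTR_RX_DMA_CMPLT  (1u << 0)
#define DEMO_INTR_TX_CMPLT      (1u << 8)

typedef struct {
    uint32_t intr_status;   /* latched causes */
    uint32_t intr_enable;   /* guest-programmed mask */
    bool     irq_level;     /* what would be driven on the IRQ line */
} DemoNicIrqState;

/* Assumed model: the line is asserted while any enabled cause is pending. */
static void demo_update_irq(DemoNicIrqState *s)
{
    s->irq_level = (s->intr_status & s->intr_enable) != 0;
}

static void demo_raise(DemoNicIrqState *s, uint32_t cause)
{
    s->intr_status |= cause;
    demo_update_irq(s);
}

static void demo_ack(DemoNicIrqState *s, uint32_t cause)
{
    s->intr_status &= ~cause;   /* write-1-to-clear style ack (assumed) */
    demo_update_irq(s);
}

int main(void)
{
    DemoNicIrqState s = { .intr_enable = DEMO_INTR_RX_DMA_CMPLT };

    demo_raise(&s, DEMO_INTR_TX_CMPLT);      /* masked: line stays low  */
    demo_raise(&s, DEMO_INTR_RX_DMA_CMPLT);  /* enabled: line goes high */
    demo_ack(&s, DEMO_INTR_RX_DMA_CMPLT);    /* cleared: line drops     */

    return s.irq_level ? 1 : 0;
}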

[Qemu-devel] [PATCH v2 56/67] target/arm: Implement SVE scatter store vector immediate

2018-02-17 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/translate-sve.c | 79 +++---
 target/arm/sve.decode  | 11 +++
 2 files changed, 65 insertions(+), 25 deletions(-)

diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 6484ecd257..0241e8e707 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4011,31 +4011,33 @@ static void trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a, uint32_t insn)
 tcg_temp_free_i64(imm);
 }
 
+/* Indexed by [xs][msz].  */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][3] = {
+{ gen_helper_sve_stbs_zsu,
+  gen_helper_sve_sths_zsu,
+  gen_helper_sve_stss_zsu, },
+{ gen_helper_sve_stbs_zss,
+  gen_helper_sve_sths_zss,
+  gen_helper_sve_stss_zss, },
+};
+
+static gen_helper_gvec_mem_scatter * const scatter_store_fn64[3][4] = {
+{ gen_helper_sve_stbd_zsu,
+  gen_helper_sve_sthd_zsu,
+  gen_helper_sve_stsd_zsu,
+  gen_helper_sve_stdd_zsu, },
+{ gen_helper_sve_stbd_zss,
+  gen_helper_sve_sthd_zss,
+  gen_helper_sve_stsd_zss,
+  gen_helper_sve_stdd_zss, },
+{ gen_helper_sve_stbd_zd,
+  gen_helper_sve_sthd_zd,
+  gen_helper_sve_stsd_zd,
+  gen_helper_sve_stdd_zd, },
+};
+
 static void trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn)
 {
-/* Indexed by [xs][msz].  */
-static gen_helper_gvec_mem_scatter * const fn32[2][3] = {
-{ gen_helper_sve_stbs_zsu,
-  gen_helper_sve_sths_zsu,
-  gen_helper_sve_stss_zsu, },
-{ gen_helper_sve_stbs_zss,
-  gen_helper_sve_sths_zss,
-  gen_helper_sve_stss_zss, },
-};
-static gen_helper_gvec_mem_scatter * const fn64[3][4] = {
-{ gen_helper_sve_stbd_zsu,
-  gen_helper_sve_sthd_zsu,
-  gen_helper_sve_stsd_zsu,
-  gen_helper_sve_stdd_zsu, },
-{ gen_helper_sve_stbd_zss,
-  gen_helper_sve_sthd_zss,
-  gen_helper_sve_stsd_zss,
-  gen_helper_sve_stdd_zss, },
-{ gen_helper_sve_stbd_zd,
-  gen_helper_sve_sthd_zd,
-  gen_helper_sve_stsd_zd,
-  gen_helper_sve_stdd_zd, },
-};
 gen_helper_gvec_mem_scatter *fn;
 
 if (a->esz < a->msz || (a->msz == 0 && a->scale)) {
@@ -4044,10 +4046,10 @@ static void trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn)
 }
 switch (a->esz) {
 case MO_32:
-fn = fn32[a->xs][a->msz];
+fn = scatter_store_fn32[a->xs][a->msz];
 break;
 case MO_64:
-fn = fn64[a->xs][a->msz];
+fn = scatter_store_fn64[a->xs][a->msz];
 break;
 default:
 g_assert_not_reached();
@@ -4056,6 +4058,33 @@ static void trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn)
cpu_reg_sp(s, a->rn), fn);
 }
 
+static void trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a, uint32_t insn)
+{
+gen_helper_gvec_mem_scatter *fn = NULL;
+TCGv_i64 imm;
+
+if (a->esz < a->msz) {
+unallocated_encoding(s);
+return;
+}
+
+switch (a->esz) {
+case MO_32:
+fn = scatter_store_fn32[0][a->msz];
+break;
+case MO_64:
+fn = scatter_store_fn64[2][a->msz];
+break;
+}
+assert(fn != NULL);
+
+/* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x])
+   by loading the immediate into the scalar parameter.  */
+imm = tcg_const_i64(a->imm << a->msz);
+do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn);
+tcg_temp_free_i64(imm);
+}
+
 /*
  * Prefetches
  */
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index f85d82e009..6ccb4289fc 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -84,6 +84,7 @@
 &rprr_gather_load  rd pg rn rm esz msz u ff xs scale
 &rpri_gather_load  rd pg rn imm esz msz u ff
 &rprr_scatter_storerd pg rn rm esz msz xs scale
+&rpri_scatter_storerd pg rn imm esz msz
 
 ###
 # Named instruction formats.  These are generally used to
@@ -216,6 +217,8 @@
&rprr_store nreg=0
 @rprr_scatter_store ... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \
&rprr_scatter_store
+@rpri_scatter_store ... msz:2 ..imm:5 ... pg:3 rn:5 rd:5 \
+   &rpri_scatter_store
 
 ###
 # Instruction patterns.  Grouped according to the SVE encodingindex.xhtml.
@@ -935,6 +938,14 @@ ST1_zprz   1110010 .. 01 . 101 ... . . \
 ST1_zprz   1110010 .. 00 . 101 ... . . \
@rprr_scatter_store xs=2 esz=3 scale=0
 
+# SVE 64-bit scatter store (vector plus immediate)
+ST1_zpiz   1110010 .. 10 . 101 ... . . \
+   @rpri_scatter_store esz=3
+
+# SVE 32-bit scatter store (vector plus immediate)
+ST1_zpiz   1110010 .. 11 . 101 ... . .
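
To make the comment in trans_ST1_zpiz() above concrete: the vector-plus-
immediate form zn[i] + (imm << msz) produces exactly the addresses that the
reused register-offset helper computes once the shifted immediate is fed in as
the scalar base and the scale is forced to 0. A small standalone check of that
equivalence (simplified to unpredicated 64-bit elements; all names below are
illustrative):

#include <assert.h>
#include <stdint.h>

enum { VLEN = 4 };

/* rn + (zm[i] << scale): the register-offset address form (ST1_zprz). */
static void addrs_reg_offset(uint64_t rn, const uint64_t *zm,
                             unsigned scale, uint64_t *out)
{
    for (unsigned i = 0; i < VLEN; i++) {
        out[i] = rn + (zm[i] << scale);
    }
}

int main(void)
{
    const uint64_t zn[VLEN] = { 0x1000, 0x2000, 0x3000, 0x4000 };
    unsigned msz = 2;              /* 4-byte elements */
    uint64_t imm = 5;              /* immediate from the encoding */

    /* Vector-plus-immediate form: zn[i] + (imm << msz). */
    uint64_t want[VLEN];
    for (unsigned i = 0; i < VLEN; i++) {
        want[i] = zn[i] + (imm << msz);
    }

    /* Same addresses via the reused helper shape: treat (imm << msz) as the
     * scalar base, the vector as the offsets, and use scale = 0. */
    uint64_t got[VLEN];
    addrs_reg_offset(imm << msz, zn, 0, got);

    for (unsigned i = 0; i < VLEN; i++) {
        assert(got[i] == want[i]);
    }
    return 0;
}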
