Re: [PATCH] apps: trivial (and non-working) fpc example application

2016-08-25 Thread Benoît CANET
Works for me (tm)

Reviewed-by: Benoît Canet 

2016-08-25 18:46 GMT+02:00 Nadav Har'El :

> This is a trivial hello-world Free Pascal application.
> It doesn't currently work because of missing syscall instruction support.
> https://github.com/cloudius-systems/osv/issues/590
> Hopefully it will begin working when OSv adds syscall instruction support.
>
> Stack trace we get in fpc initialization function:
>
>   4  0x00482754 in invalid_opcode (ef=0x82a2b088)
>  at arch/x64/exceptions.cc:320
>   5  
>   6  0x10c07b72 in SYSTEM_$$_FPSYSCALL$INT64$INT64$INT64$$INT64 ()
>   7  0x10c0839a in SYSTEM_$$_FPGETRLIMIT$LONGINT$PRLIMIT$$LONGINT
> ()
>   8  0x000c000ff3b0 in ?? ()
>   9  0x10c2a357 in SYSTEM_$$_CHECKINITIALSTKLEN$QWORD$$QWORD ()
>   10 0x in ?? ()
>
> Note that to see these function names, you'll need to remove the strip
> command from the script ppas.sh automatically generated by fpc, and run
> it again.
>
> This example was based on
> http://wiki.lazarus.freepascal.org/Free_Pascal_on_OSv
> but I wanted to not need the special linking instructions.
>
> Signed-off-by: Nadav Har'El 
> ---
>  fpc-example/module.py |  3 +++
>  fpc-example/Makefile  | 11 +++
>  fpc-example/README|  1 +
>  fpc-example/fpc-hello.pas | 20 
>  fpc-example/usr.manifest  |  1 +
>  5 files changed, 36 insertions(+)
>  create mode 100644 fpc-example/module.py
>  create mode 100644 fpc-example/Makefile
>  create mode 100644 fpc-example/README
>  create mode 100644 fpc-example/fpc-hello.pas
>  create mode 100644 fpc-example/usr.manifest
>
> diff --git a/fpc-example/module.py b/fpc-example/module.py
> new file mode 100644
> index 000..cc4b4bd
> --- /dev/null
> +++ b/fpc-example/module.py
> @@ -0,0 +1,3 @@
> +from osv.modules import api
> +
> +default = api.run("/fpc-hello")
> diff --git a/fpc-example/Makefile b/fpc-example/Makefile
> new file mode 100644
> index 000..3733869
> --- /dev/null
> +++ b/fpc-example/Makefile
> @@ -0,0 +1,11 @@
> +.PHONY: module
> +module: fpc-hello
> +
> +
> +fpc-hello: fpc-hello.pas
> +   fpc -fPIC -XD -Xc -s -ofpc-hello fpc-hello.pas
> +   # if we drop -init and -fini from link line, it works. Like this
> it doesn't, currently.
> +   sh ppas.sh
> +
> +clean:
> +   rm -f fpc-hello fpc-hello.o link.res ppas.sh
> diff --git a/fpc-example/README b/fpc-example/README
> new file mode 100644
> index 000..8eb493d
> --- /dev/null
> +++ b/fpc-example/README
> @@ -0,0 +1 @@
> +Doesn't work - check out https://github.com/cloudius-
> systems/osv/issues/590
> diff --git a/fpc-example/fpc-hello.pas b/fpc-example/fpc-hello.pas
> new file mode 100644
> index 000..3659f42
> --- /dev/null
> +++ b/fpc-example/fpc-hello.pas
> @@ -0,0 +1,20 @@
> +library hello;
> +
> +uses
> +  unixtype;
> +
> +// use the C function 'write'
> +function CWrite(fd : cInt; buf:pChar; nbytes : unixtype.TSize): TSsize;
> external name 'write';
> +
> +// start function for OSv
> +function main: longint; cdecl;
> +const
> +  MyText: PChar = 'It works!';
> +begin
> +  CWrite(StdOutputHandle,MyText,strlen(MyText));
> +  main:=0;
> +end;
> +
> +exports main name 'main'; // OSv searches for 'main' in the library
> +
> +end.
> diff --git a/fpc-example/usr.manifest b/fpc-example/usr.manifest
> new file mode 100644
> index 000..c3c1304
> --- /dev/null
> +++ b/fpc-example/usr.manifest
> @@ -0,0 +1 @@
> +/fpc-hello: ${MODULE_DIR}/fpc-hello
> --
> 2.7.4
>
> --
> You received this message because you are subscribed to the Google Groups
> "OSv Development" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to osv-dev+unsubscr...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.


[PATCH] apps: trivial (and non-working) fpc example application

2016-08-25 Thread Nadav Har'El
This is a trivial hello-world Free Pascal application.
It doesn't currently work because of missing syscall instruction support.
https://github.com/cloudius-systems/osv/issues/590
Hopefully it will begin working when OSv adds syscall instruction support.

Stack trace we get in fpc initialization function:

  4  0x00482754 in invalid_opcode (ef=0x82a2b088)
 at arch/x64/exceptions.cc:320
  5  
  6  0x10c07b72 in SYSTEM_$$_FPSYSCALL$INT64$INT64$INT64$$INT64 ()
  7  0x10c0839a in SYSTEM_$$_FPGETRLIMIT$LONGINT$PRLIMIT$$LONGINT ()
  8  0x000c000ff3b0 in ?? ()
  9  0x10c2a357 in SYSTEM_$$_CHECKINITIALSTKLEN$QWORD$$QWORD ()
  10 0x in ?? ()

Note that to see these function names, you'll need to remove the strip
command from the script ppas.sh automatically generated by fpc, and run
it again.

This example was based on
http://wiki.lazarus.freepascal.org/Free_Pascal_on_OSv
but I wanted to not need the special linking instructions.

Signed-off-by: Nadav Har'El 
---
 fpc-example/module.py |  3 +++
 fpc-example/Makefile  | 11 +++
 fpc-example/README|  1 +
 fpc-example/fpc-hello.pas | 20 
 fpc-example/usr.manifest  |  1 +
 5 files changed, 36 insertions(+)
 create mode 100644 fpc-example/module.py
 create mode 100644 fpc-example/Makefile
 create mode 100644 fpc-example/README
 create mode 100644 fpc-example/fpc-hello.pas
 create mode 100644 fpc-example/usr.manifest

diff --git a/fpc-example/module.py b/fpc-example/module.py
new file mode 100644
index 000..cc4b4bd
--- /dev/null
+++ b/fpc-example/module.py
@@ -0,0 +1,3 @@
+from osv.modules import api
+
+default = api.run("/fpc-hello")
diff --git a/fpc-example/Makefile b/fpc-example/Makefile
new file mode 100644
index 000..3733869
--- /dev/null
+++ b/fpc-example/Makefile
@@ -0,0 +1,11 @@
+.PHONY: module
+module: fpc-hello
+
+
+fpc-hello: fpc-hello.pas
+   fpc -fPIC -XD -Xc -s -ofpc-hello fpc-hello.pas
+   # if we drop -init and -fini from link line, it works. Like this it 
doesn't, currently.
+   sh ppas.sh
+
+clean:
+   rm -f fpc-hello fpc-hello.o link.res ppas.sh
diff --git a/fpc-example/README b/fpc-example/README
new file mode 100644
index 000..8eb493d
--- /dev/null
+++ b/fpc-example/README
@@ -0,0 +1 @@
+Doesn't work - check out https://github.com/cloudius-systems/osv/issues/590
diff --git a/fpc-example/fpc-hello.pas b/fpc-example/fpc-hello.pas
new file mode 100644
index 000..3659f42
--- /dev/null
+++ b/fpc-example/fpc-hello.pas
@@ -0,0 +1,20 @@
+library hello;
+ 
+uses
+  unixtype;
+ 
+// use the C function 'write'
+function CWrite(fd : cInt; buf:pChar; nbytes : unixtype.TSize): TSsize;  
external name 'write';
+ 
+// start function for OSv
+function main: longint; cdecl;
+const
+  MyText: PChar = 'It works!';
+begin
+  CWrite(StdOutputHandle,MyText,strlen(MyText));
+  main:=0;
+end;
+ 
+exports main name 'main'; // OSv searches for 'main' in the library
+ 
+end.
diff --git a/fpc-example/usr.manifest b/fpc-example/usr.manifest
new file mode 100644
index 000..c3c1304
--- /dev/null
+++ b/fpc-example/usr.manifest
@@ -0,0 +1 @@
+/fpc-hello: ${MODULE_DIR}/fpc-hello
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.


Re: [PATCH 1/2] x64: 'syscall' instruction support

2016-08-25 Thread Nadav Har'El
Hi Benoit (and Pekka), thanks. Here are some comments and questions:


--
Nadav Har'El
n...@scylladb.com

On Thu, Aug 25, 2016 at 1:07 PM, Benoit Canet <
benoit.canet.cont...@gmail.com> wrote:

> Enable "fast system calls" via the 'syscall' instruction on OSv. The
> instruction is used by Go programs on Linux/x86-64 for system calls.
>
> Signed-off-by: Pekka Enberg 
> Signed-off-by: Benoît Canet 
> ---
>  arch/x64/arch-setup.cc | 12 
>  arch/x64/entry.S   | 20 
>  arch/x64/msr.hh|  3 +++
>  linux.cc   |  9 +
>  4 files changed, 44 insertions(+)
>
> diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
> index 5e76d82..520651d 100644
> --- a/arch/x64/arch-setup.cc
> +++ b/arch/x64/arch-setup.cc
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include "processor.hh"
> +#include "processor-flags.h"
>  #include "msr.hh"
>  #include "xen.hh"
>  #include 
> @@ -213,6 +214,16 @@ static inline void disable_pic()
>  XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21);
> processor::outb(0xff, 0xa1); }, {});
>  }
>
> +extern "C" void syscall_entry(void);
> +
> +static void setup_syscall()
> +{
> +processor::wrmsr(msr::IA32_STAR,  static_cast(1*8) << 32);
>

What is this calculation "1*8" supposed to mean? can it get a name?


> +processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast(
> syscall_entry));
> +processor::wrmsr(msr::IA32_FMASK, 0);
>

I'm not an expert in this stuff enough to understand what FMASK was
supposed to solve, and why 0 is good enough for us.


> +processor::wrmsr(msr::IA32_EFER,  processor::rdmsr(msr::IA32_EFER) |
> 0x01);
>

please define a name - e.g., IA32_EFER_SCE or msr_bits::IA32_EFER::SCE or
something, for this "0x1" (SCE = "system call extension").


> +}
> +
>  void arch_init_premain()
>  {
>  auto omb = *osv_multiboot_info;
> @@ -220,6 +231,7 @@ void arch_init_premain()
> debug_early_u64("Error reading disk (real mode): ",
> static_cast(omb.disk_err));
>
>  disable_pic();
> +setup_syscall();
>  }
>
>  #include "drivers/driver.hh"
> diff --git a/arch/x64/entry.S b/arch/x64/entry.S
> index b6f5abe..d3a864a 100644
> --- a/arch/x64/entry.S
> +++ b/arch/x64/entry.S
> @@ -159,3 +159,23 @@ call_signal_handler_thunk:
>  iretq
>  .cfi_endproc
>
> +.global syscall_entry
> +syscall_entry:
> +   # There is no ring transition and rflags are left unchanged. The
> only
> +   # thing we need to save is the rip which is stored in rcx by the
> syscall
> +   # instruction.
> +   push %rcx
> +   # FIXME: registers clobbered?

+   # FIXME: FPU state?
>

Please see Glauber's comments about these issues  in
https://groups.google.com/forum/#!msg/osv-dev/PW3bkaVCuMg/-bePROMbWWEJ
If you think Glauber's comments are wrong, please remove the FIXMEs. If
they are right, we better fix those FIXMEs and not commit
something which doesn't actually work.

I think the need to save FPU state is unlikely (because there will only be
dirty FPU state if the function calling the SYSCALL instruction is using
the FPU) but may be necessary for completeness. But it will be really
inefficient :-( Linux has it easier here, because it doesn't use FPU in the
kernel...


+   # FIXME: system call arguments?
> +   movq %r10, %rcx
> +   # rotate syscall arguments
> +   movq %r8,  %r9
> +   movq %rcx, %r8
> +   movq %rdx, %rcx
> +   movq %rsi, %rdx
> +   movq %rdi, %rsi
> +   movq %rax, %rdi
>

I wonder how this code behaves with gdb if you try to "backtrace" a crash
inside a system call. Maybe we need all sorts of ".cfi" pseudo-instructions
to solve this? But we can attend to this issue later (let's just not forget
to return to it).

+   call syscall_wrapper
> +   pop %rcx
> +   jmp *%rcx
>

This looks correct, although I guess the more "traditional" approach would
be to use SYSRET here instead of the pop and the jmp.



> diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh
> index 154bba7..d77c75c 100644
> --- a/arch/x64/msr.hh
> +++ b/arch/x64/msr.hh
> @@ -58,6 +58,9 @@ enum class msr : uint32_t {
>
>  IA32_APIC_BASE = 0x001b,
>  IA32_EFER = 0xc080,
> +IA32_STAR = 0xc081,
> +IA32_LSTAR = 0xc082,
> +IA32_FMASK = 0xc084,
>  IA32_FS_BASE = 0xc100,
>
>  KVM_WALL_CLOCK = 0x11,
> diff --git a/linux.cc b/linux.cc
> index bd82ca9..8a2a4a3 100644
> --- a/linux.cc
> +++ b/linux.cc
> @@ -291,3 +291,12 @@ long syscall(long number, ...)
>  return -1;
>  }
>  long __syscall(long number, ...)  __attribute__((alias("syscall")));
> +
> +extern "C" long syscall_wrapper(long number, ...)
> +{
> +auto ret = syscall(number);
>
+if (ret < 0) {
> +return -errno;
>

I think there's a bug: The syscall instruction itself is not supposed to
modify errno. This code does. I think we need to restore errno here to what
it was before we called syscall().



> +}
> +   

Re: [PATCH 2/2] app: return main thread id to the caller

2016-08-25 Thread Nadav Har'El
I just sent another RFC patch which should cause osv_execve to wait until
app_runtime() is the new application's.
I haven't really tested it much beyond the usual "make check" - can you
please try it out?

Thanks,
Nadav.


--
Nadav Har'El
n...@scylladb.com

On Wed, Aug 24, 2016 at 7:21 AM, Justin Cinkelj 
wrote:

>
>
> On 08/23/2016 11:58 PM, Nadav Har'El wrote:
>
>
> On Tue, Aug 23, 2016 at 3:28 PM, Justin Cinkelj < 
> justin.cink...@xlab.si> wrote:
>
>>
>> @nadav
>> With the second RFC patch, osv_execve doesn't return thread_id==0 any more.
>>
>>
>> (code after RFC-2 patch) The application::start_and_join() line
>> sched::thread::current()->set_app_runtime(runtime()); has to finish, so
>> that with_all_app_threads works as desired. I think that's in line with your
>> explanation. If I add sleep(1) just before it, then the problem becomes 100%
>> reproducible (it was 10-20% reproducible before).
>>
>
> Yes, so I think we understand now why the problem happens. Now the
> question is how to fix it.
>
> I have some ideas on how it might be possible to fix it, i.e., delay
> osv_execve()'s return until the new thread got its new app_runtime setting.
>
> However, I started wondering whether we should fix anything in
> osv_execve():
>
> osv_execve() promises to return the new thread id, and that it does (after
> my last patch).
> However, it doesn't promise anything about how far along this thread went:
> Did it load the executable? Start to run it? Did it even set the thread's
> app_runtime? We don't know.
> The question becomes, if we start promising more, why the app_runtime
> thing? Isn't this just one of the arbitrary things that happen during the
> starting of the application - why promise that in particular?
> If osv_execve() returned an app instead of a thread id, things would be
> somewhat different, but even then, there is a question whether it is fine
> for osv_execve() to return an app that no thread yet belongs to - or
> whether we should wait until at least the one thread we created belongs to
> this app.
>
> So this got me thinking: what if we just decide that osv_execve()'s caller
> is not guaranteed the app_runtime was set?
> The caller (your MPI code looking for threads to setaffinity for) would
> just need to loop checking find_by_id(tid)->app_runtime() stopping if
> find_by_id can't find the thread (this means it exited already!) or if
> app_runtime() changed from what sched::thread::current->app_runtime() was.
> Alternatively, if the setaffinity code already works in a loop (since
> threads can be created after startup!), it's even simpler: Just check if
> find_by_id(tid)->app_runtime() is still the same as current->app_runtime()
> - and if it is, just don't do anything (a later iteration of the loop will
> find the threads).
>
> Does this last plan make sense?
>
> Yes. No.
> Yes, that someone has to wait until app_runtime is set. But as user of
> OSv, I don't want to know all the OSv internal details, and how to work
> around them.
> If osv_execve returns before the new thread's app_runtime is set, then maybe
> osv_get_all_app_threads (from a patch I haven't yet sent to the list) could do
> the waiting on app_runtime.
> But it would have to check if new_thread.app_runtime is different from
> caller_thread.app_runtime. And that would not work if some thread wants to
> get the list of its own threads - caller_thread == new_thread.
> So the wait loop should cover that extra case (not that the current Open MPI
> code needs it; it's just to be a bit more general).
> Sounds acceptable?
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.


[PATCH] RFC: osv_execve() should wait for app_runtime to be set

2016-08-25 Thread Nadav Har'El
This patch makes osv_execve() wait a little bit longer before returning -
until not only the thread id it returns is set, but also this thread has a
new app_runtime set.

I thought of at least a dozen ways to do this, each uglier than the next,
so I ended up with the solution which I considered least ugly, but is still
not pretty... osv_execve() now uses application::run_and_join() directly,
instead of the trivial osv::run() wrapper. The run_and_join() function
received a new parameter - a "waiter" object that it wakes when the
app_runtime is set - and osv_execve() waits on this object.

We use the "waiter" idiom (see wait_record.hh) to ensure that it is ok for
the parent thread (running osv_execve()) to request waiting before or after
the child thread (running run_and_join()) waking it up.

CAVEAT EMPTOR: After osv_execve() returns the returned thread will have the
expected new app_runtime(). However, if the application exits, run_and_join()
restores the previous app_runtime() which becomes visible for a very brief
period until the thread exits. Fixing this will require yet another ugly
workaround, so I wanted to ask first whether it is actually a problem for
our only known use case.

Signed-off-by: Nadav Har'El 
---
 include/osv/app.hh |  7 +--
 core/app.cc| 11 ---
 core/osv_execve.cc | 23 ---
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/include/osv/app.hh b/include/osv/app.hh
index dbdf1da..6fa503a 100644
--- a/include/osv/app.hh
+++ b/include/osv/app.hh
@@ -25,6 +25,8 @@
 extern "C" void __libc_start_main(int(*)(int, char**), int, char**, void(*)(),
 void(*)(), void(*)(), void*);
 
+class waiter;
+
 namespace osv {
 
 class application;
@@ -131,7 +133,8 @@ public:
 static shared_app_t run_and_join(const std::string& command,
 const std::vector& args,
 bool new_program = false,
-const std::unordered_map *env = nullptr);
+const std::unordered_map *env = nullptr,
+waiter* setup_waiter = nullptr);
 
 /**
  * Installs a termination callback which will be called when
@@ -188,7 +191,7 @@ private:
 return shared_from_this();
 }
 void start();
-void start_and_join();
+void start_and_join(waiter* setup_waiter);
 void main();
 void run_main(std::string path, int argc, char** argv);
 void run_main();
diff --git a/core/app.cc b/core/app.cc
index bf6ff68..56fd18b 100644
--- a/core/app.cc
+++ b/core/app.cc
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 using namespace boost::range;
 
@@ -138,10 +139,11 @@ void run(const std::vector& args) {
 shared_app_t application::run_and_join(const std::string& command,
   const std::vector& args,
   bool new_program,
-  const std::unordered_map *env)
+  const std::unordered_map *env,
+  waiter* setup_waiter)
 {
 auto app = std::make_shared(command, args, new_program, env);
-app->start_and_join();
+app->start_and_join(setup_waiter);
 return app;
 }
 
@@ -235,7 +237,7 @@ int application::join()
 return _return_code;
 }
 
-void application::start_and_join()
+void application::start_and_join(waiter* setup_waiter)
 {
 // We start the new application code in the current thread. We temporarily
 // change the app_runtime pointer of this thread, while keeping the old
@@ -244,6 +246,9 @@ void application::start_and_join()
 auto original_app = sched::thread::current()->app_runtime();
 sched::thread::current()->set_app_runtime(runtime());
 auto original_name = sched::thread::current()->name();
+if (setup_waiter) {
+setup_waiter->wake();
+}
 _thread = pthread_self(); // may be null if the caller is not a pthread.
 main();
 sched::thread::current()->set_name(original_name);
diff --git a/core/osv_execve.cc b/core/osv_execve.cc
index a7594b3..218cc78 100644
--- a/core/osv_execve.cc
+++ b/core/osv_execve.cc
@@ -2,6 +2,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Record thread state changes (termination) by storing exit status into a map.
@@ -19,19 +20,16 @@ static int thread_run_app_in_namespace(std::string filename,
 const std::unordered_map envp,
 long* thread_id,
 int notification_fd,
-sched::thread* parent)
+waiter* parent_waiter)
 {
-int ret;
 const bool new_program = true; // run in new ELF namespace
 long tid = gettid(); // sched::thread::current()->id();
 
 debugf_execve("thread_run_app_in_namespace... tid=%ld\n", tid);
-if (thread_id) {
-parent->wake_with([&] { 

[PATCH 2/2] syscalls: Add most syscalls required by GO binaries

2016-08-25 Thread Benoit Canet
The commented one will require manual work.

Signed-off-by: Benoît Canet 
---
 linux.cc   |  56 
 syscalls.h | 141 +
 2 files changed, 180 insertions(+), 17 deletions(-)
 create mode 100644 syscalls.h

diff --git a/linux.cc b/linux.cc
index 8a2a4a3..3097e91 100644
--- a/linux.cc
+++ b/linux.cc
@@ -24,6 +24,35 @@
 #include 
 #include 
 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
 #include 
 
 extern "C" long gettid()
@@ -263,27 +292,20 @@ long long_mmap(void *addr, size_t length, int prot, int 
flags, int fd, off_t off
 return fn(arg1, arg2, arg3, arg4, arg5, arg6);  \
 } while (0)
 
+int rt_sigaction(int sig, const struct sigaction * act, struct sigaction * 
oact, size_t sigsetsize)
+{
+return sigaction(sig, act, oact);
+}
+
+int rt_sigprocmask(int how, sigset_t * nset, sigset_t * oset, size_t 
sigsetsize)
+{
+return sigprocmask(how, nset, oset);
+}
 
 long syscall(long number, ...)
 {
 switch (number) {
-SYSCALL1(uname, struct utsname *);
-SYSCALL3(write, int, const void *, size_t);
-SYSCALL0(gettid);
-SYSCALL2(clock_gettime, clockid_t, struct timespec *);
-SYSCALL2(clock_getres, clockid_t, struct timespec *);
-SYSCALL6(futex, int *, int, int, const struct timespec *, int *, int);
-SYSCALL1(close, int);
-SYSCALL2(pipe2, int *, int);
-SYSCALL1(epoll_create1, int);
-SYSCALL2(eventfd2, unsigned int, int);
-SYSCALL4(epoll_ctl, int, int, int, struct epoll_event *);
-SYSCALL4(epoll_wait, int, struct epoll_event *, int, int);
-SYSCALL4(accept4, int, struct sockaddr *, socklen_t *, int);
-SYSCALL5(get_mempolicy, int *, unsigned long *, unsigned long, void *, 
int);
-SYSCALL3(sched_getaffinity_syscall, pid_t, unsigned, unsigned long *);
-SYSCALL6(long_mmap, void *, size_t, int, int, int, off_t);
-SYSCALL2(munmap, void *, size_t);
+#include "syscalls.h"
 }
 
 debug_always("syscall(): unimplemented system call %d\n", number);
diff --git a/syscalls.h b/syscalls.h
new file mode 100644
index 000..cb86390
--- /dev/null
+++ b/syscalls.h
@@ -0,0 +1,141 @@
+SYSCALL0(gettid);
+SYSCALL2(clock_gettime, clockid_t, struct timespec *);
+SYSCALL2(clock_getres, clockid_t, struct timespec *);
+SYSCALL6(futex, int *, int, int, const struct timespec *, int *, int);
+SYSCALL1(close, int);
+SYSCALL2(pipe2, int *, int);
+SYSCALL1(epoll_create1, int);
+SYSCALL2(eventfd2, unsigned int, int);
+SYSCALL4(epoll_ctl, int, int, int, struct epoll_event *);
+SYSCALL4(epoll_wait, int, struct epoll_event *, int, int);
+SYSCALL4(accept4, int, struct sockaddr *, socklen_t *, int);
+SYSCALL5(get_mempolicy, int *, unsigned long *, unsigned long, void *, int);
+SYSCALL3(sched_getaffinity_syscall, pid_t, unsigned, unsigned long *);
+SYSCALL4(rt_sigaction, int, const struct sigaction *, struct sigaction *, 
size_t);
+SYSCALL4(rt_sigprocmask, int, sigset_t *, sigset_t *, size_t);
+SYSCALL1(uname, struct utsname *);
+SYSCALL6(long_mmap, void *, size_t, int, int, int, off_t);
+SYSCALL2(munmap, void *, size_t);
+SYSCALL3(read, int, char *, size_t);
+SYSCALL3(write, int, const char *, size_t);
+SYSCALL2(stat, const char *, struct stat *);
+SYSCALL2(fstat, int, struct stat *);
+SYSCALL2(lstat, const char *, struct stat *);
+SYSCALL3(lseek, unsigned int, off_t, unsigned int);
+SYSCALL3(mprotect, void *, size_t, int);
+SYSCALL4(pread64, unsigned int, char *, size_t, loff_t);
+SYSCALL4(pwrite64, unsigned int, const char *, size_t, loff_t);
+SYSCALL1(pipe, int *);
+SYSCALL5(select, int, fd_set *, fd_set *, fd_set *, struct timeval *);
+SYSCALL3(madvise, void *, size_t, int);
+SYSCALL1(dup, unsigned int);
+SYSCALL2(dup2, unsigned int, unsigned int);
+SYSCALL0(pause);
+SYSCALL2(nanosleep, struct timespec *, struct timespec *);
+SYSCALL0(getpid);
+SYSCALL4(sendfile, int, int, off_t *, size_t);
+SYSCALL3(socket, int, int, int);
+SYSCALL3(connect, int, struct sockaddr *, int);
+SYSCALL3(accept, int, struct sockaddr *, unsigned int *);
+SYSCALL6(sendto, int, void *, size_t, unsigned int, struct sockaddr *, int);
+SYSCALL6(recvfrom, int, void *, size_t, unsigned int, struct sockaddr *, 
unsigned int *);
+SYSCALL3(sendmsg, int, struct msghdr *, int);
+SYSCALL3(recvmsg, int, struct msghdr *, int);
+SYSCALL2(shutdown, int, int);
+SYSCALL3(bind, int, struct sockaddr *, int);
+SYSCALL2(listen, int, int);
+SYSCALL3(getsockname, int, struct sockaddr *, unsigned int *);
+SYSCALL3(getpeername, int, struct sockaddr *, unsigned int *);
+SYSCALL4(socketpair, int, int, int, int *);
+SYSCALL5(setsockopt, int, int, int, char *, int);
+SYSCALL5(getsockopt, int, int, int, char *, unsigned int *);
+SYSCALL2(kill, 

[PATCH 1/2] x64: 'syscall' instruction support

2016-08-25 Thread Benoit Canet
Enable "fast system calls" via the 'syscall' instruction on OSv. The
instruction is used by Go programs on Linux/x86-64 for system calls.

Signed-off-by: Pekka Enberg 
Signed-off-by: Benoît Canet 
---
 arch/x64/arch-setup.cc | 12 
 arch/x64/entry.S   | 20 
 arch/x64/msr.hh|  3 +++
 linux.cc   |  9 +
 4 files changed, 44 insertions(+)

diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index 5e76d82..520651d 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include "processor.hh"
+#include "processor-flags.h"
 #include "msr.hh"
 #include "xen.hh"
 #include 
@@ -213,6 +214,16 @@ static inline void disable_pic()
 XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); processor::outb(0xff, 
0xa1); }, {});
 }
 
+extern "C" void syscall_entry(void);
+
+static void setup_syscall()
+{
+processor::wrmsr(msr::IA32_STAR,  static_cast(1*8) << 32);
+processor::wrmsr(msr::IA32_LSTAR, 
reinterpret_cast(syscall_entry));
+processor::wrmsr(msr::IA32_FMASK, 0);
+processor::wrmsr(msr::IA32_EFER,  processor::rdmsr(msr::IA32_EFER) | 0x01);
+}
+
 void arch_init_premain()
 {
 auto omb = *osv_multiboot_info;
@@ -220,6 +231,7 @@ void arch_init_premain()
debug_early_u64("Error reading disk (real mode): ", 
static_cast(omb.disk_err));
 
 disable_pic();
+setup_syscall();
 }
 
 #include "drivers/driver.hh"
diff --git a/arch/x64/entry.S b/arch/x64/entry.S
index b6f5abe..d3a864a 100644
--- a/arch/x64/entry.S
+++ b/arch/x64/entry.S
@@ -159,3 +159,23 @@ call_signal_handler_thunk:
 iretq
 .cfi_endproc
 
+.global syscall_entry
+syscall_entry:
+   # There is no ring transition and rflags are left unchanged. The only
+   # thing we need to save is the rip which is stored in rcx by the syscall
+   # instruction.
+   push %rcx
+   # FIXME: registers clobbered?
+   # FIXME: FPU state?
+   # FIXME: system call arguments?
+   movq %r10, %rcx
+   # rotate syscall arguments
+   movq %r8,  %r9
+   movq %rcx, %r8
+   movq %rdx, %rcx
+   movq %rsi, %rdx
+   movq %rdi, %rsi
+   movq %rax, %rdi
+   call syscall_wrapper
+   pop %rcx
+   jmp *%rcx
diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh
index 154bba7..d77c75c 100644
--- a/arch/x64/msr.hh
+++ b/arch/x64/msr.hh
@@ -58,6 +58,9 @@ enum class msr : uint32_t {
 
 IA32_APIC_BASE = 0x001b,
 IA32_EFER = 0xc080,
+IA32_STAR = 0xc081,
+IA32_LSTAR = 0xc082,
+IA32_FMASK = 0xc084,
 IA32_FS_BASE = 0xc100,
 
 KVM_WALL_CLOCK = 0x11,
diff --git a/linux.cc b/linux.cc
index bd82ca9..8a2a4a3 100644
--- a/linux.cc
+++ b/linux.cc
@@ -291,3 +291,12 @@ long syscall(long number, ...)
 return -1;
 }
 long __syscall(long number, ...)  __attribute__((alias("syscall")));
+
+extern "C" long syscall_wrapper(long number, ...)
+{
+auto ret = syscall(number);
+if (ret < 0) {
+return -errno;
+}
+return 0;
+}
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.