Re: [PATCH 08/15] habanalabs: add event queue and interrupts

2019-01-28 Thread Oded Gabbay
On Fri, Jan 25, 2019 at 9:51 AM Mike Rapoport  wrote:
>
> On Wed, Jan 23, 2019 at 02:00:50AM +0200, Oded Gabbay wrote:
> > This patch adds support for receiving events from Goya's control CPU and
> > for receiving MSI-X interrupts from Goya's DMA engines and CPU.
> >
> > Goya's PCI controller supports up to 8 MSI-X interrupts, which only 6 of
> > them are currently used. The first 5 interrupts are dedicated for Goya's
> > DMA engine queues. The 6th interrupt is dedicated for Goya's control CPU.
> >
> > The DMA queue will signal its MSI-X entry upon each completion of a command
> > buffer that was placed on its primary queue. The driver will then mark that
> > CB as completed and free the related resources. It will also update the
> > command submission object which that CB belongs to.
> >
> > There is a dedicated event queue (EQ) between the driver and Goya's control
> > CPU. The EQ is located on the Host memory. The control CPU writes a new
> > entry to the EQ for various reasons, such as ECC error, MMU page fault, Hot
> > temperature. After writing the new entry to the EQ, the control CPU will
> > trigger its dedicated MSI-X entry to signal the driver that there is a new
> > entry in the EQ. The driver will then read the entry and act accordingly.
> >
> > Signed-off-by: Oded Gabbay 
> > ---
> >  drivers/misc/habanalabs/device.c|  35 +-
> >  drivers/misc/habanalabs/goya/goya.c | 522 +++-
> >  drivers/misc/habanalabs/goya/goyaP.h|   1 +
> >  drivers/misc/habanalabs/habanalabs.h|  37 ++
> >  drivers/misc/habanalabs/include/goya/goya.h |   1 -
> >  drivers/misc/habanalabs/irq.c   | 144 ++
> >  6 files changed, 729 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/misc/habanalabs/device.c 
> > b/drivers/misc/habanalabs/device.c
> > index 98220628a467..9199e070e79e 100644
> > --- a/drivers/misc/habanalabs/device.c
> > +++ b/drivers/misc/habanalabs/device.c
> > @@ -173,9 +173,17 @@ static int device_early_init(struct hl_device *hdev)
> >   hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
> >   if (hdev->cq_wq == NULL) {
> >   dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
> > + rc = -ENOMEM;
>
> Apparently, it should have been in one of the earlier patches
>
Correct, fixed
> >   goto asid_fini;
> >   }
> >
> > + hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
> > + if (hdev->eq_wq == NULL) {
> > + dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
> > + rc = -ENOMEM;
> > + goto free_cq_wq;
> > + }
> > +
> >   hl_cb_mgr_init(>kernel_cb_mgr);
> >
> >   mutex_init(>device_open);
> > @@ -184,6 +192,8 @@ static int device_early_init(struct hl_device *hdev)
> >
> >   return 0;
> >
> > +free_cq_wq:
> > + destroy_workqueue(hdev->cq_wq);
> >  asid_fini:
> >   hl_asid_fini(hdev);
> >  early_fini:
> > @@ -205,6 +215,7 @@ static void device_early_fini(struct hl_device *hdev)
> >
> >   hl_cb_mgr_fini(hdev, >kernel_cb_mgr);
> >
> > + destroy_workqueue(hdev->eq_wq);
> >   destroy_workqueue(hdev->cq_wq);
> >
> >   hl_asid_fini(hdev);
> > @@ -343,11 +354,22 @@ int hl_device_init(struct hl_device *hdev, struct 
> > class *hclass)
> >   }
> >   }
> >
> > + /*
> > +  * Initialize the event queue. Must be done before hw_init,
> > +  * because there the address of the event queue is being
> > +  * passed as argument to request_irq
> > +  */
> > + rc = hl_eq_init(hdev, >event_queue);
> > + if (rc) {
> > + dev_err(hdev->dev, "failed to initialize event queue\n");
> > + goto cq_fini;
> > + }
> > +
> >   /* Allocate the kernel context */
> >   hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
> >   if (!hdev->kernel_ctx) {
> >   rc = -ENOMEM;
> > - goto cq_fini;
> > + goto eq_fini;
> >   }
> >
> >   hdev->user_ctx = NULL;
> > @@ -392,6 +414,8 @@ int hl_device_init(struct hl_device *hdev, struct class 
> > *hclass)
> >   "kernel ctx is still alive on initialization 
> > failure\n");
> >  free_ctx:
> >   kfree(hdev->kernel_ctx);
> > +eq_fini:
> > + hl_eq_fini(hdev, >event_queue);
> >  cq_fini:
> >   for (i = 0 ; i < cq_ready_cnt ; i++)
> >   hl_cq_fini(hdev, >completion_queue[i]);
> > @@ -433,6 +457,13 @@ void hl_device_fini(struct hl_device *hdev)
> >   /* Mark device as disabled */
> >   hdev->disabled = true;
> >
> > + /*
> > +  * Halt the engines and disable interrupts so we won't get any more
> > +  * completions from H/W and we won't have any accesses from the
> > +  * H/W to the host machine
> > +  */
> > + hdev->asic_funcs->halt_engines(hdev, true);
> > +
> >   hl_cb_pool_fini(hdev);
> >
> >   /* Release kernel context */
> > @@ -442,6 +473,8 @@ void 

Re: [PATCH 08/15] habanalabs: add event queue and interrupts

2019-01-24 Thread Mike Rapoport
On Wed, Jan 23, 2019 at 02:00:50AM +0200, Oded Gabbay wrote:
> This patch adds support for receiving events from Goya's control CPU and
> for receiving MSI-X interrupts from Goya's DMA engines and CPU.
> 
> Goya's PCI controller supports up to 8 MSI-X interrupts, which only 6 of
> them are currently used. The first 5 interrupts are dedicated for Goya's
> DMA engine queues. The 6th interrupt is dedicated for Goya's control CPU.
> 
> The DMA queue will signal its MSI-X entry upon each completion of a command
> buffer that was placed on its primary queue. The driver will then mark that
> CB as completed and free the related resources. It will also update the
> command submission object which that CB belongs to.
> 
> There is a dedicated event queue (EQ) between the driver and Goya's control
> CPU. The EQ is located on the Host memory. The control CPU writes a new
> entry to the EQ for various reasons, such as ECC error, MMU page fault, Hot
> temperature. After writing the new entry to the EQ, the control CPU will
> trigger its dedicated MSI-X entry to signal the driver that there is a new
> entry in the EQ. The driver will then read the entry and act accordingly.
> 
> Signed-off-by: Oded Gabbay 
> ---
>  drivers/misc/habanalabs/device.c|  35 +-
>  drivers/misc/habanalabs/goya/goya.c | 522 +++-
>  drivers/misc/habanalabs/goya/goyaP.h|   1 +
>  drivers/misc/habanalabs/habanalabs.h|  37 ++
>  drivers/misc/habanalabs/include/goya/goya.h |   1 -
>  drivers/misc/habanalabs/irq.c   | 144 ++
>  6 files changed, 729 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/misc/habanalabs/device.c 
> b/drivers/misc/habanalabs/device.c
> index 98220628a467..9199e070e79e 100644
> --- a/drivers/misc/habanalabs/device.c
> +++ b/drivers/misc/habanalabs/device.c
> @@ -173,9 +173,17 @@ static int device_early_init(struct hl_device *hdev)
>   hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
>   if (hdev->cq_wq == NULL) {
>   dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
> + rc = -ENOMEM;

Apparently, it should have been in one of the earlier patches

>   goto asid_fini;
>   }
>  
> + hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
> + if (hdev->eq_wq == NULL) {
> + dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
> + rc = -ENOMEM;
> + goto free_cq_wq;
> + }
> +
>   hl_cb_mgr_init(>kernel_cb_mgr);
>  
>   mutex_init(>device_open);
> @@ -184,6 +192,8 @@ static int device_early_init(struct hl_device *hdev)
>  
>   return 0;
>  
> +free_cq_wq:
> + destroy_workqueue(hdev->cq_wq);
>  asid_fini:
>   hl_asid_fini(hdev);
>  early_fini:
> @@ -205,6 +215,7 @@ static void device_early_fini(struct hl_device *hdev)
>  
>   hl_cb_mgr_fini(hdev, >kernel_cb_mgr);
>  
> + destroy_workqueue(hdev->eq_wq);
>   destroy_workqueue(hdev->cq_wq);
>  
>   hl_asid_fini(hdev);
> @@ -343,11 +354,22 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>   }
>   }
>  
> + /*
> +  * Initialize the event queue. Must be done before hw_init,
> +  * because there the address of the event queue is being
> +  * passed as argument to request_irq
> +  */
> + rc = hl_eq_init(hdev, >event_queue);
> + if (rc) {
> + dev_err(hdev->dev, "failed to initialize event queue\n");
> + goto cq_fini;
> + }
> +
>   /* Allocate the kernel context */
>   hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
>   if (!hdev->kernel_ctx) {
>   rc = -ENOMEM;
> - goto cq_fini;
> + goto eq_fini;
>   }
>  
>   hdev->user_ctx = NULL;
> @@ -392,6 +414,8 @@ int hl_device_init(struct hl_device *hdev, struct class 
> *hclass)
>   "kernel ctx is still alive on initialization 
> failure\n");
>  free_ctx:
>   kfree(hdev->kernel_ctx);
> +eq_fini:
> + hl_eq_fini(hdev, >event_queue);
>  cq_fini:
>   for (i = 0 ; i < cq_ready_cnt ; i++)
>   hl_cq_fini(hdev, >completion_queue[i]);
> @@ -433,6 +457,13 @@ void hl_device_fini(struct hl_device *hdev)
>   /* Mark device as disabled */
>   hdev->disabled = true;
>  
> + /*
> +  * Halt the engines and disable interrupts so we won't get any more
> +  * completions from H/W and we won't have any accesses from the
> +  * H/W to the host machine
> +  */
> + hdev->asic_funcs->halt_engines(hdev, true);
> +
>   hl_cb_pool_fini(hdev);
>  
>   /* Release kernel context */
> @@ -442,6 +473,8 @@ void hl_device_fini(struct hl_device *hdev)
>   /* Reset the H/W. It will be in idle state after this returns */
>   hdev->asic_funcs->hw_fini(hdev, true);
>  
> + hl_eq_fini(hdev, >event_queue);
> +
>   for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
>   

[PATCH 08/15] habanalabs: add event queue and interrupts

2019-01-22 Thread Oded Gabbay
This patch adds support for receiving events from Goya's control CPU and
for receiving MSI-X interrupts from Goya's DMA engines and CPU.

Goya's PCI controller supports up to 8 MSI-X interrupts, of which only 6
are currently used. The first 5 interrupts are dedicated to Goya's DMA
engine queues. The 6th interrupt is dedicated to Goya's control CPU.

The DMA queue will signal its MSI-X entry upon each completion of a command
buffer that was placed on its primary queue. The driver will then mark that
CB as completed and free the related resources. It will also update the
command submission object which that CB belongs to.

There is a dedicated event queue (EQ) between the driver and Goya's control
CPU. The EQ is located in host memory. The control CPU writes a new entry
to the EQ for various reasons, such as an ECC error, an MMU page fault, or
high temperature. After writing the new entry to the EQ, the control CPU will
trigger its dedicated MSI-X entry to signal the driver that there is a new
entry in the EQ. The driver will then read the entry and act accordingly.

Signed-off-by: Oded Gabbay 
---
 drivers/misc/habanalabs/device.c            |  35 +-
 drivers/misc/habanalabs/goya/goya.c         | 522 +++-
 drivers/misc/habanalabs/goya/goyaP.h        |   1 +
 drivers/misc/habanalabs/habanalabs.h        |  37 ++
 drivers/misc/habanalabs/include/goya/goya.h |   1 -
 drivers/misc/habanalabs/irq.c               | 144 ++
 6 files changed, 729 insertions(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 98220628a467..9199e070e79e 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -173,9 +173,17 @@ static int device_early_init(struct hl_device *hdev)
hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
if (hdev->cq_wq == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
+   rc = -ENOMEM;
goto asid_fini;
}
 
+   hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
+   if (hdev->eq_wq == NULL) {
+   dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
+   rc = -ENOMEM;
+   goto free_cq_wq;
+   }
+
hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
mutex_init(&hdev->device_open);
@@ -184,6 +192,8 @@ static int device_early_init(struct hl_device *hdev)
 
return 0;
 
+free_cq_wq:
+   destroy_workqueue(hdev->cq_wq);
 asid_fini:
hl_asid_fini(hdev);
 early_fini:
@@ -205,6 +215,7 @@ static void device_early_fini(struct hl_device *hdev)
 
hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 
+   destroy_workqueue(hdev->eq_wq);
destroy_workqueue(hdev->cq_wq);
 
hl_asid_fini(hdev);
@@ -343,11 +354,22 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
}
}
 
+   /*
+* Initialize the event queue. Must be done before hw_init,
+* because there the address of the event queue is being
+* passed as argument to request_irq
+*/
+   rc = hl_eq_init(hdev, &hdev->event_queue);
+   if (rc) {
+   dev_err(hdev->dev, "failed to initialize event queue\n");
+   goto cq_fini;
+   }
+
/* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
if (!hdev->kernel_ctx) {
rc = -ENOMEM;
-   goto cq_fini;
+   goto eq_fini;
}
 
hdev->user_ctx = NULL;
@@ -392,6 +414,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
"kernel ctx is still alive on initialization failure\n");
 free_ctx:
kfree(hdev->kernel_ctx);
+eq_fini:
+   hl_eq_fini(hdev, &hdev->event_queue);
 cq_fini:
for (i = 0 ; i < cq_ready_cnt ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
@@ -433,6 +457,13 @@ void hl_device_fini(struct hl_device *hdev)
/* Mark device as disabled */
hdev->disabled = true;
 
+   /*
+* Halt the engines and disable interrupts so we won't get any more
+* completions from H/W and we won't have any accesses from the
+* H/W to the host machine
+*/
+   hdev->asic_funcs->halt_engines(hdev, true);
+
hl_cb_pool_fini(hdev);
 
/* Release kernel context */
@@ -442,6 +473,8 @@ void hl_device_fini(struct hl_device *hdev)
/* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, true);
 
+   hl_eq_fini(hdev, &hdev->event_queue);
+
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
kfree(hdev->completion_queue);
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 08d5227eaf1d..6c04277ae0fa 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++