net/mlx5: Fix race between PCI error handlers and health work
[cascardo/linux.git] / drivers / net / ethernet / mellanox / mlx5 / core / main.c
index d9c3c70..9f90226 100644 (file)
@@ -844,12 +844,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
        struct pci_dev *pdev = dev->pdev;
        int err;
 
-       err = mlx5_query_hca_caps(dev);
-       if (err) {
-               dev_err(&pdev->dev, "query hca failed\n");
-               goto out;
-       }
-
        err = mlx5_query_board_id(dev);
        if (err) {
                dev_err(&pdev->dev, "query board id failed\n");
@@ -1023,6 +1017,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 
        mlx5_start_health_poll(dev);
 
+       err = mlx5_query_hca_caps(dev);
+       if (err) {
+               dev_err(&pdev->dev, "query hca failed\n");
+               goto err_stop_poll;
+       }
+
        if (boot && mlx5_init_once(dev, priv)) {
                dev_err(&pdev->dev, "sw objs init failed\n");
                goto err_stop_poll;
@@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
        dev_info(&pdev->dev, "%s was called\n", __func__);
        mlx5_enter_error_state(dev);
        mlx5_unload_one(dev, priv, false);
-       pci_save_state(pdev);
-       mlx5_pci_disable_device(dev);
+       /* In case of kernel call save the pci state and drain health wq */
+       if (state) {
+               pci_save_state(pdev);
+               mlx5_drain_health_wq(dev);
+               mlx5_pci_disable_device(dev);
+       }
+
        return state == pci_channel_io_perm_failure ?
                PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
 }