drivers/infiniband/hw/hfi1/affinity.c

   1 /*
   2  * Copyright(c) 2015, 2016 Intel Corporation.
   3  *
   4  * This file is provided under a dual BSD/GPLv2 license.  When using or
   5  * redistributing this file, you may do so under either license.
   6  *
   7  * GPL LICENSE SUMMARY
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of version 2 of the GNU General Public License as
  11  * published by the Free Software Foundation.
  12  *
  13  * This program is distributed in the hope that it will be useful, but
  14  * WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * General Public License for more details.
  17  *
  18  * BSD LICENSE
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  *  - Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  *  - Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in
  28  *    the documentation and/or other materials provided with the
  29  *    distribution.
  30  *  - Neither the name of Intel Corporation nor the names of its
  31  *    contributors may be used to endorse or promote products derived
  32  *    from this software without specific prior written permission.
  33  *
  34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45  *
  46  */
  47 #include <linux/topology.h>
  48 #include <linux/cpumask.h>
  49 #include <linux/module.h>
  50
  51 #include "hfi.h"
  52 #include "affinity.h"
  53 #include "sdma.h"
  54 #include "trace.h"
  55
  56 struct hfi1_affinity_node_list node_affinity = {
  57         .list = LIST_HEAD_INIT(node_affinity.list),
  58         .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
  59 };
  60
  61 /* Name of IRQ types, indexed by enum irq_type */
  62 static const char * const irq_type_names[] = {
  63         "SDMA",
  64         "RCVCTXT",
  65         "GENERAL",
  66         "OTHER",
  67 };
  68
  69 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
  70 {
  71         cpumask_clear(&set->mask);
  72         cpumask_clear(&set->used);
  73         set->gen = 0;
  74 }
  75
  76 /* Initialize non-HT cpu cores mask */
  77 void init_real_cpu_mask(void)
  78 {
  79         int possible, curr_cpu, i, ht;
  80
  81         cpumask_clear(&node_affinity.real_cpu_mask);
  82
  83         /* Start with cpu online mask as the real cpu mask */
  84         cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
  85
  86         /*
  87          * Remove HT cores from the real cpu mask.  Do this in two steps below.
  88          */
  89         possible = cpumask_weight(&node_affinity.real_cpu_mask);
  90         ht = cpumask_weight(topology_sibling_cpumask(
  91                                 cpumask_first(&node_affinity.real_cpu_mask)));
  92         /*
  93          * Step 1.  Skip over the first N HT siblings and use them as the
  94          * "real" cores.  Assumes that HT cores are not enumerated in
  95          * succession (except in the single core case).
  96          */
  97         curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
  98         for (i = 0; i < possible / ht; i++)
  99                 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 100         /*
 101          * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
 102          * skip any gaps.
 103          */
 104         for (; i < possible; i++) {
 105                 cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
 106                 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 107         }
 108 }
 109
 110 void node_affinity_init(void)
 111 {
 112         cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
 113         /*
 114          * The real cpu mask is part of the affinity struct but it has to be
 115          * initialized early. It is needed to calculate the number of user
 116          * contexts in set_up_context_variables().
 117          */
 118         init_real_cpu_mask();
 119 }
 120
 121 void node_affinity_destroy(void)
 122 {
 123         struct list_head *pos, *q;
 124         struct hfi1_affinity_node *entry;
 125
 126         spin_lock(&node_affinity.lock);
 127         list_for_each_safe(pos, q, &node_affinity.list) {
 128                 entry = list_entry(pos, struct hfi1_affinity_node,
 129                                    list);
 130                 list_del(pos);
 131                 kfree(entry);
 132         }
 133         spin_unlock(&node_affinity.lock);
 134 }
 135
 136 static struct hfi1_affinity_node *node_affinity_allocate(int node)
 137 {
 138         struct hfi1_affinity_node *entry;
 139
 140         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 141         if (!entry)
 142                 return NULL;
 143         entry->node = node;
 144         INIT_LIST_HEAD(&entry->list);
 145
 146         return entry;
 147 }
 148
 149 /*
 150  * It appends an entry to the list.
 151  * It *must* be called with node_affinity.lock held.
 152  */
 153 static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
 154 {
 155         list_add_tail(&entry->list, &node_affinity.list);
 156 }
 157
 158 /* It must be called with node_affinity.lock held */
 159 static struct hfi1_affinity_node *node_affinity_lookup(int node)
 160 {
 161         struct list_head *pos;
 162         struct hfi1_affinity_node *entry;
 163
 164         list_for_each(pos, &node_affinity.list) {
 165                 entry = list_entry(pos, struct hfi1_affinity_node, list);
 166                 if (entry->node == node)
 167                         return entry;
 168         }
 169
 170         return NULL;
 171 }
 172
 173 /*
 174  * Interrupt affinity.
 175  *
 176  * non-rcv avail gets a default mask that
 177  * starts as possible cpus with threads reset
 178  * and each rcv avail reset.
 179  *
 180  * rcv avail gets node relative 1 wrapping back
 181  * to the node relative 1 as necessary.
 182  *
 183  */
 184 int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 185 {
 186         int node = pcibus_to_node(dd->pcidev->bus);
 187         struct hfi1_affinity_node *entry;
 188         const struct cpumask *local_mask;
 189         int curr_cpu, possible, i;
 190
 191         if (node < 0)
 192                 node = numa_node_id();
 193         dd->node = node;
 194
 195         local_mask = cpumask_of_node(dd->node);
 196         if (cpumask_first(local_mask) >= nr_cpu_ids)
 197                 local_mask = topology_core_cpumask(0);
 198
 199         spin_lock(&node_affinity.lock);
 200         entry = node_affinity_lookup(dd->node);
 201         spin_unlock(&node_affinity.lock);
 202
 203         /*
 204          * If this is the first time this NUMA node's affinity is used,
 205          * create an entry in the global affinity structure and initialize it.
 206          */
 207         if (!entry) {
 208                 entry = node_affinity_allocate(node);
 209                 if (!entry) {
 210                         dd_dev_err(dd,
 211                                    "Unable to allocate global affinity node\n");
 212                         return -ENOMEM;
 213                 }
 214                 init_cpu_mask_set(&entry->def_intr);
 215                 init_cpu_mask_set(&entry->rcv_intr);
 216                 /* Use the "real" cpu mask of this node as the default */
 217                 cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
 218                             local_mask);
 219
 220                 /* fill in the receive list */
 221                 possible = cpumask_weight(&entry->def_intr.mask);
 222                 curr_cpu = cpumask_first(&entry->def_intr.mask);
 223
 224                 if (possible == 1) {
 225                         /* only one CPU, everyone will use it */
 226                         cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
 227                 } else {
 228                         /*
 229                          * Retain the first CPU in the default list for the
 230                          * control context.
 231                          */
 232                         curr_cpu = cpumask_next(curr_cpu,
 233                                                 &entry->def_intr.mask);
 234
 235                         /*
 236                          * Remove the remaining kernel receive queues from
 237                          * the default list and add them to the receive list.
 238                          */
 239                         for (i = 0; i < dd->n_krcv_queues - 1; i++) {
 240                                 cpumask_clear_cpu(curr_cpu,
 241                                                   &entry->def_intr.mask);
 242                                 cpumask_set_cpu(curr_cpu,
 243                                                 &entry->rcv_intr.mask);
 244                                 curr_cpu = cpumask_next(curr_cpu,
 245                                                         &entry->def_intr.mask);
 246                                 if (curr_cpu >= nr_cpu_ids)
 247                                         break;
 248                         }
 249                 }
 250
 251                 spin_lock(&node_affinity.lock);
 252                 node_affinity_add_tail(entry);
 253                 spin_unlock(&node_affinity.lock);
 254         }
 255
 256         return 0;
 257 }
 258
 259 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 260 {
 261         int ret;
 262         cpumask_var_t diff;
 263         struct hfi1_affinity_node *entry;
 264         struct cpu_mask_set *set;
 265         struct sdma_engine *sde = NULL;
 266         struct hfi1_ctxtdata *rcd = NULL;
 267         char extra[64];
 268         int cpu = -1;
 269
 270         extra[0] = '\0';
 271         cpumask_clear(&msix->mask);
 272
 273         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
 274         if (!ret)
 275                 return -ENOMEM;
 276
 277         spin_lock(&node_affinity.lock);
 278         entry = node_affinity_lookup(dd->node);
 279         spin_unlock(&node_affinity.lock);
 280
 281         switch (msix->type) {
 282         case IRQ_SDMA:
 283                 sde = (struct sdma_engine *)msix->arg;
 284                 scnprintf(extra, 64, "engine %u", sde->this_idx);
 285                 /* fall through */
 286         case IRQ_GENERAL:
 287                 set = &entry->def_intr;
 288                 break;
 289         case IRQ_RCVCTXT:
 290                 rcd = (struct hfi1_ctxtdata *)msix->arg;
 291                 if (rcd->ctxt == HFI1_CTRL_CTXT) {
 292                         set = &entry->def_intr;
 293                         cpu = cpumask_first(&set->mask);
 294                 } else {
 295                         set = &entry->rcv_intr;
 296                 }
 297                 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 298                 break;
 299         default:
 300                 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
 301                 return -EINVAL;
 302         }
 303
 304         /*
 305          * The control receive context is placed on a particular CPU, which
 306          * is set above.  Skip accounting for it.  Everything else finds its
 307          * CPU here.
 308          */
 309         if (cpu == -1 && set) {
 310                 spin_lock(&node_affinity.lock);
 311                 if (cpumask_equal(&set->mask, &set->used)) {
 312                         /*
 313                          * We've used up all the CPUs, bump up the generation
 314                          * and reset the 'used' map
 315                          */
 316                         set->gen++;
 317                         cpumask_clear(&set->used);
 318                 }
 319                 cpumask_andnot(diff, &set->mask, &set->used);
 320                 cpu = cpumask_first(diff);
 321                 cpumask_set_cpu(cpu, &set->used);
 322                 spin_unlock(&node_affinity.lock);
 323         }
 324
 325         switch (msix->type) {
 326         case IRQ_SDMA:
 327                 sde->cpu = cpu;
 328                 break;
 329         case IRQ_GENERAL:
 330         case IRQ_RCVCTXT:
 331         case IRQ_OTHER:
 332                 break;
 333         }
 334
 335         cpumask_set_cpu(cpu, &msix->mask);
 336         dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
 337                     msix->msix.vector, irq_type_names[msix->type],
 338                     extra, cpu);
 339         irq_set_affinity_hint(msix->msix.vector, &msix->mask);
 340
 341         free_cpumask_var(diff);
 342         return 0;
 343 }
 344
 345 void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 346                            struct hfi1_msix_entry *msix)
 347 {
 348         struct cpu_mask_set *set = NULL;
 349         struct hfi1_ctxtdata *rcd;
 350         struct hfi1_affinity_node *entry;
 351
 352         spin_lock(&node_affinity.lock);
 353         entry = node_affinity_lookup(dd->node);
 354         spin_unlock(&node_affinity.lock);
 355
 356         switch (msix->type) {
 357         case IRQ_SDMA:
 358         case IRQ_GENERAL:
 359                 set = &entry->def_intr;
 360                 break;
 361         case IRQ_RCVCTXT:
 362                 rcd = (struct hfi1_ctxtdata *)msix->arg;
 363                 /* only do accounting for non control contexts */
 364                 if (rcd->ctxt != HFI1_CTRL_CTXT)
 365                         set = &entry->rcv_intr;
 366                 break;
 367         default:
 368                 return;
 369         }
 370
 371         if (set) {
 372                 spin_lock(&node_affinity.lock);
 373                 cpumask_andnot(&set->used, &set->used, &msix->mask);
 374                 if (cpumask_empty(&set->used) && set->gen) {
 375                         set->gen--;
 376                         cpumask_copy(&set->used, &set->mask);
 377                 }
 378                 spin_unlock(&node_affinity.lock);
 379         }
 380
 381         irq_set_affinity_hint(msix->msix.vector, NULL);
 382         cpumask_clear(&msix->mask);
 383 }
 384
 385 int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 386 {
 387         int cpu = -1, ret;
 388         cpumask_var_t diff, mask, intrs;
 389         struct hfi1_affinity_node *entry;
 390         const struct cpumask *node_mask,
 391                 *proc_mask = tsk_cpus_allowed(current);
 392         struct cpu_mask_set *set = &node_affinity.proc;
 393
 394         /*
 395          * check whether process/context affinity has already
 396          * been set
 397          */
 398         if (cpumask_weight(proc_mask) == 1) {
 399                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
 400                           current->pid, current->comm,
 401                           cpumask_pr_args(proc_mask));
 402                 /*
 403                  * Mark the pre-set CPU as used. This is atomic so we don't
 404                  * need the lock
 405                  */
 406                 cpu = cpumask_first(proc_mask);
 407                 cpumask_set_cpu(cpu, &set->used);
 408                 goto done;
 409         } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
 410                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
 411                           current->pid, current->comm,
 412                           cpumask_pr_args(proc_mask));
 413                 goto done;
 414         }
 415
 416         /*
 417          * The process does not have a preset CPU affinity so find one to
 418          * recommend. We prefer CPUs on the same NUMA as the device.
 419          */
 420
 421         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
 422         if (!ret)
 423                 goto done;
 424         ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
 425         if (!ret)
 426                 goto free_diff;
 427         ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
 428         if (!ret)
 429                 goto free_mask;
 430
 431         spin_lock(&node_affinity.lock);
 432         /*
 433          * If we've used all available CPUs, clear the mask and start
 434          * overloading.
 435          */
 436         if (cpumask_equal(&set->mask, &set->used)) {
 437                 set->gen++;
 438                 cpumask_clear(&set->used);
 439         }
 440
 441         entry = node_affinity_lookup(dd->node);
 442         /* CPUs used by interrupt handlers */
 443         cpumask_copy(intrs, (entry->def_intr.gen ?
 444                              &entry->def_intr.mask :
 445                              &entry->def_intr.used));
 446         cpumask_or(intrs, intrs, (entry->rcv_intr.gen ?
 447                                   &entry->rcv_intr.mask :
 448                                   &entry->rcv_intr.used));
 449         hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
 450                   cpumask_pr_args(intrs));
 451
 452         /*
 453          * If we don't have a NUMA node requested, preference is towards
 454          * device NUMA node
 455          */
 456         if (node == -1)
 457                 node = dd->node;
 458         node_mask = cpumask_of_node(node);
 459         hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node,
 460                   cpumask_pr_args(node_mask));
 461
 462         /* diff will hold all unused cpus */
 463         cpumask_andnot(diff, &set->mask, &set->used);
 464         hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff));
 465
 466         /* get cpumask of available CPUs on preferred NUMA */
 467         cpumask_and(mask, diff, node_mask);
 468         hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask));
 469
 470         /*
 471          * At first, we don't want to place processes on the same
 472          * CPUs as interrupt handlers.
 473          */
 474         cpumask_andnot(diff, mask, intrs);
 475         if (!cpumask_empty(diff))
 476                 cpumask_copy(mask, diff);
 477
 478         /*
 479          * if we don't have a cpu on the preferred NUMA, get
 480          * the list of the remaining available CPUs
 481          */
 482         if (cpumask_empty(mask)) {
 483                 cpumask_andnot(diff, &set->mask, &set->used);
 484                 cpumask_andnot(mask, diff, node_mask);
 485         }
 486         hfi1_cdbg(PROC, "possible CPUs for process %*pbl",
 487                   cpumask_pr_args(mask));
 488
 489         cpu = cpumask_first(mask);
 490         if (cpu >= nr_cpu_ids) /* empty */
 491                 cpu = -1;
 492         else
 493                 cpumask_set_cpu(cpu, &set->used);
 494         spin_unlock(&node_affinity.lock);
 495
 496         free_cpumask_var(intrs);
 497 free_mask:
 498         free_cpumask_var(mask);
 499 free_diff:
 500         free_cpumask_var(diff);
 501 done:
 502         return cpu;
 503 }
 504
 505 void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
 506 {
 507         struct cpu_mask_set *set = &node_affinity.proc;
 508
 509         if (cpu < 0)
 510                 return;
 511         spin_lock(&node_affinity.lock);
 512         cpumask_clear_cpu(cpu, &set->used);
 513         if (cpumask_empty(&set->used) && set->gen) {
 514                 set->gen--;
 515                 cpumask_copy(&set->used, &set->mask);
 516         }
 517         spin_unlock(&node_affinity.lock);
 518 }
 519