diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index aa0fadc..cd184bd 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -126,6 +126,8 @@ struct hv_ring_buffer_info {
 
        u32 ring_datasize;              /* < ring_size */
        u32 ring_data_startoffset;
+       u32 priv_write_index;
+       u32 priv_read_index;
 };
 
 /*
@@ -151,6 +153,33 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi,
        *read = dsize - *write;
 }
 
+static inline u32 hv_get_bytes_to_read(struct hv_ring_buffer_info *rbi)
+{
+       u32 read_loc, write_loc, dsize, read;
+
+       dsize = rbi->ring_datasize;
+       read_loc = rbi->ring_buffer->read_index;
+       write_loc = READ_ONCE(rbi->ring_buffer->write_index);
+
+       read = write_loc >= read_loc ? (write_loc - read_loc) :
+               (dsize - read_loc) + write_loc;
+
+       return read;
+}
+
+static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi)
+{
+       u32 read_loc, write_loc, dsize, write;
+
+       dsize = rbi->ring_datasize;
+       read_loc = READ_ONCE(rbi->ring_buffer->read_index);
+       write_loc = rbi->ring_buffer->write_index;
+
+       write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
+               read_loc - write_loc;
+       return write;
+}
+
 /*
  * VMBUS version is a 32 bit entity broken up into
  * two 16 bit quantities: major_number.minor_number.
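[Editor's note] The two helpers added above implement standard circular-buffer arithmetic on a data area of ring_datasize bytes. A minimal userspace sketch of the same arithmetic, for illustration only (not kernel code; helper names here are hypothetical):

#include <assert.h>
#include <stdint.h>

/* Same math as hv_get_bytes_to_read(): unread bytes, wrapping at dsize. */
static uint32_t avail_to_read(uint32_t rd, uint32_t wr, uint32_t dsize)
{
	return wr >= rd ? wr - rd : (dsize - rd) + wr;
}

/* Same math as hv_get_bytes_to_write(): room left for the producer. */
static uint32_t avail_to_write(uint32_t rd, uint32_t wr, uint32_t dsize)
{
	return wr >= rd ? dsize - (wr - rd) : rd - wr;
}

int main(void)
{
	/* Linear case: writer ahead of reader in a 4096-byte data area. */
	assert(avail_to_read(100, 3000, 4096) == 2900);
	assert(avail_to_write(100, 3000, 4096) == 1196);
	/* Wrapped case: writer has wrapped past the end of the data area. */
	assert(avail_to_read(3000, 100, 4096) == 1196);
	assert(avail_to_write(3000, 100, 4096) == 2900);
	return 0;
}

Note that in both cases avail_to_read + avail_to_write == dsize by construction.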
@@ -645,6 +674,11 @@ enum hv_signal_policy {
        HV_SIGNAL_POLICY_EXPLICIT,
 };
 
+enum hv_numa_policy {
+       HV_BALANCED = 0,
+       HV_LOCALIZED,
+};
+
 enum vmbus_device_type {
        HV_IDE = 0,
        HV_SCSI,
@@ -672,9 +706,6 @@ struct vmbus_device {
 };
 
 struct vmbus_channel {
-       /* Unique channel id */
-       int id;
-
        struct list_head listentry;
 
        struct hv_device *device_obj;
@@ -821,6 +852,43 @@ struct vmbus_channel {
         * ring lock to preserve the current behavior.
         */
        bool acquire_ring_lock;
+       /*
+        * For performance critical channels (storage, networking
+        * etc.), Hyper-V has a mechanism to enhance the throughput
+        * at the expense of latency:
+        * When the host is to be signaled, we just set a bit in a shared page
+        * and this bit will be inspected by the hypervisor within a certain
+        * window and if the bit is set, the host will be signaled. The window
+        * of time is the monitor latency - currently around 100 usecs. This
+        * mechanism improves throughput by:
+        *
+        * A) Making the host more efficient - each time it wakes up,
+        *    it will potentially process more packets. The
+        *    monitor latency allows a batch to build up.
+        * B) By deferring the signaling hypercall, we also minimize
+        *    the interrupts.
+        *
+        * Clearly, these optimizations improve throughput at the expense of
+        * latency. Furthermore, since the channel is shared for both
+        * control and data messages, control messages currently suffer
+        * unnecessary latency, adversely impacting performance and boot
+        * time. To fix this issue, permit tagging the channel as being
+        * in "low latency" mode. In this mode, we will bypass the monitor
+        * mechanism.
+        */
+       bool low_latency;
+
+       /*
+        * NUMA distribution policy:
+        * We support two policies:
+        * 1) Balanced: Here all performance critical channels are
+        *    distributed evenly amongst all the NUMA nodes.
+        *    This is the default policy.
+        * 2) Localized: All channels of a given instance of a
+        *    performance critical service will be assigned CPUs
+        *    within a selected NUMA node.
+        */
+       enum hv_numa_policy affinity_policy;
 
 };
 
@@ -841,6 +909,12 @@ static inline void set_channel_signal_state(struct vmbus_channel *c,
        c->signal_policy = policy;
 }
 
+static inline void set_channel_affinity_state(struct vmbus_channel *c,
+                                             enum hv_numa_policy policy)
+{
+       c->affinity_policy = policy;
+}
+
 static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
 {
        c->batched_reading = state;
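[Editor's note] A hedged usage sketch for the new affinity setter: how a performance-critical driver might request localized NUMA placement for a channel (driver and callback names are hypothetical; HV_BALANCED remains the default). Assumes this header is included.

/* Hypothetical driver snippet: keep this device's channels on CPUs
 * within a single NUMA node. */
static void my_sc_creation_callback(struct vmbus_channel *new_sc)
{
	set_channel_affinity_state(new_sc, HV_LOCALIZED);
}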
@@ -862,6 +936,16 @@ static inline void set_channel_pending_send_size(struct vmbus_channel *c,
        c->outbound.ring_buffer->pending_send_sz = size;
 }
 
+static inline void set_low_latency_mode(struct vmbus_channel *c)
+{
+       c->low_latency = true;
+}
+
+static inline void clear_low_latency_mode(struct vmbus_channel *c)
+{
+       c->low_latency = false;
+}
+
 void vmbus_onmessage(void *context);
 
 int vmbus_request_offers(void);
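[Editor's note] A similar hedged sketch for the low-latency tagging added above: a latency-sensitive control channel opting out of the monitor mechanism (function name is hypothetical). Assumes this header is included.

/* Hypothetical: bypass the monitor page (and its ~100 usec batching
 * window) so control messages signal the host immediately. */
static void my_setup_control_channel(struct vmbus_channel *ctrl_channel)
{
	set_low_latency_mode(ctrl_channel);
}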
@@ -1091,7 +1175,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
                        resource_size_t min, resource_size_t max,
                        resource_size_t size, resource_size_t align,
                        bool fb_overlap_ok);
-
+void vmbus_free_mmio(resource_size_t start, resource_size_t size);
 int vmbus_cpu_number_to_vp_number(int cpu_number);
 u64 hv_do_hypercall(u64 control, void *input, void *output);
 
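[Editor's note] This hunk adds vmbus_free_mmio() as the teardown counterpart to vmbus_allocate_mmio(). A hedged sketch of the intended pairing, in the style of a framebuffer driver (all names and sizes are hypothetical). Assumes this header and linux/ioport.h are included.

static struct resource *fb_mmio;	/* hypothetical */

static int my_probe(struct hv_device *hdev)
{
	/* Claim 8 MiB of MMIO space, 1 MiB aligned, anywhere allowed
	 * (min 0, max -1); overlapping the legacy framebuffer is OK. */
	return vmbus_allocate_mmio(&fb_mmio, hdev, 0, -1,
				   0x800000, 0x100000, true);
}

static void my_remove(struct hv_device *hdev)
{
	/* Release what vmbus_allocate_mmio() claimed. */
	vmbus_free_mmio(fb_mmio->start, resource_size(fb_mmio));
}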
@@ -1227,6 +1311,27 @@ u64 hv_do_hypercall(u64 control, void *input, void *output);
        .guid = UUID_LE(0x44c4f61d, 0x4444, 0x4400, 0x9d, 0x52, \
                        0x80, 0x2e, 0x27, 0xed, 0xe1, 0x9f)
 
+/*
+ * Linux doesn't support these 3 devices: the first two are for
+ * Automatic Virtual Machine Activation, and the third is for
+ * Remote Desktop Virtualization.
+ * {f8e65716-3cb3-4a06-9a60-1889c5cccab5}
+ * {3375baf4-9e15-4b30-b765-67acb10d607b}
+ * {276aacf4-ac15-426c-98dd-7521ad3f01fe}
+ */
+
+#define HV_AVMA1_GUID \
+       .guid = UUID_LE(0xf8e65716, 0x3cb3, 0x4a06, 0x9a, 0x60, \
+                       0x18, 0x89, 0xc5, 0xcc, 0xca, 0xb5)
+
+#define HV_AVMA2_GUID \
+       .guid = UUID_LE(0x3375baf4, 0x9e15, 0x4b30, 0xb7, 0x65, \
+                       0x67, 0xac, 0xb1, 0x0d, 0x60, 0x7b)
+
+#define HV_RDV_GUID \
+       .guid = UUID_LE(0x276aacf4, 0xac15, 0x426c, 0x98, 0xdd, \
+                       0x75, 0x21, 0xad, 0x3f, 0x01, 0xfe)
+
 /*
  * Common header for Hyper-V ICs
  */
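[Editor's note] For context, .guid macros like the three added above are consumed in a driver's VMBus device-ID table. A sketch using the existing HV_NIC_GUID from this header (table contents are illustrative):

/* Hedged sketch: typical use of a .guid macro from this header. */
static const struct hv_vmbus_device_id id_table[] = {
	{ HV_NIC_GUID, },	/* synthetic network device */
	{ },			/* terminator */
};

MODULE_DEVICE_TABLE(vmbus, id_table);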
@@ -1315,6 +1420,15 @@ struct ictimesync_data {
        u8 flags;
 } __packed;
 
+struct ictimesync_ref_data {
+       u64 parenttime;
+       u64 vmreferencetime;
+       u8 flags;
+       char leapflags;
+       char stratum;
+       u8 reserved[3];
+} __packed;
+
 struct hyperv_service_callback {
        u8 msg_type;
        char *log_msg;
@@ -1328,6 +1442,9 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
                                        struct icmsg_negotiate *, u8 *, int,
                                        int);
 
+void hv_event_tasklet_disable(struct vmbus_channel *channel);
+void hv_event_tasklet_enable(struct vmbus_channel *channel);
+
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
 
 /*
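[Editor's note] A hedged sketch of the usage pattern the two new tasklet declarations suggest: quiescing event delivery around a channel-state change (the surrounding function is hypothetical; this is not how the patch itself uses them).

static void my_update_channel_state(struct vmbus_channel *channel)
{
	/* No channel event callbacks can run between these two calls. */
	hv_event_tasklet_disable(channel);
	/* ... safely modify per-channel state here ... */
	hv_event_tasklet_enable(channel);
}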
@@ -1338,4 +1455,143 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id,
                                  const uuid_le *shv_host_servie_id);
+void vmbus_set_event(struct vmbus_channel *channel);
+
+/* Get the start of the ring buffer. */
+static inline void *
+hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info)
+{
+       return (void *)ring_info->ring_buffer->buffer;
+}
+
+/*
+ * To optimize flow management on the send side,
+ * when the sender is blocked because of lack of
+ * sufficient space in the ring buffer, the consumer
+ * of the ring buffer can potentially signal the producer.
+ * This is controlled by the following parameters:
+ *
+ * 1. pending_send_sz: This is the size in bytes that the
+ *    producer is trying to send.
+ * 2. The feature bit feat_pending_send_sz is set to indicate if
+ *    the consumer of the ring will signal when the ring
+ *    state transitions from being full to a state where
+ *    there is room for the producer to send the pending packet.
+ */
+
+static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi)
+{
+       u32 cur_write_sz;
+       u32 pending_sz;
+
+       /*
+        * Issue a full memory barrier before making the signaling decision.
+        * The barrier is needed because if the read of pending_sz (in this
+        * function) were reordered ahead of the commit of the new read
+        * index (in the calling function), the host could set pending_sz
+        * after we sampled it and go to sleep before we commit the read
+        * index, and we would miss sending the interrupt. A full memory
+        * barrier prevents this reordering.
+        */
+       virt_mb();
+
+       pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
+       /* If the other end is not blocked on write don't bother. */
+       if (pending_sz == 0)
+               return false;
+
+       cur_write_sz = hv_get_bytes_to_write(rbi);
+
+       if (cur_write_sz >= pending_sz)
+               return true;
+
+       return false;
+}
+
+/*
+ * An API to support in-place processing of incoming VMBUS packets.
+ */
+#define VMBUS_PKT_TRAILER      8
+
+static inline struct vmpacket_descriptor *
+get_next_pkt_raw(struct vmbus_channel *channel)
+{
+       struct hv_ring_buffer_info *ring_info = &channel->inbound;
+       u32 read_loc = ring_info->priv_read_index;
+       void *ring_buffer = hv_get_ring_buffer(ring_info);
+       struct vmpacket_descriptor *cur_desc;
+       u32 packetlen;
+       u32 dsize = ring_info->ring_datasize;
+       u32 delta = read_loc - ring_info->ring_buffer->read_index;
+       u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta);
+
+       if (bytes_avail_toread < sizeof(struct vmpacket_descriptor))
+               return NULL;
+
+       if ((read_loc + sizeof(*cur_desc)) > dsize)
+               return NULL;
+
+       cur_desc = ring_buffer + read_loc;
+       packetlen = cur_desc->len8 << 3;
+
+       /*
+        * If the packet under consideration is wrapping around,
+        * return failure.
+        */
+       if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1))
+               return NULL;
+
+       return cur_desc;
+}
+
+/*
+ * A helper function to step through packets "in-place"
+ * This API is to be called after each successful call
+ * get_next_pkt_raw().
+ */
+static inline void put_pkt_raw(struct vmbus_channel *channel,
+                               struct vmpacket_descriptor *desc)
+{
+       struct hv_ring_buffer_info *ring_info = &channel->inbound;
+       u32 read_loc = ring_info->priv_read_index;
+       u32 packetlen = desc->len8 << 3;
+       u32 dsize = ring_info->ring_datasize;
+
+       if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize)
+               BUG();
+       /*
+        * Include the packet trailer.
+        */
+       ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
+}
+
+/*
+ * This call commits the read index and potentially signals the host.
+ * Here is the pattern for using the "in-place" consumption APIs:
+ *
+ * while (get_next_pkt_raw()) {
+ *     process the packet "in-place";
+ *     put_pkt_raw();
+ * }
+ * if (packets processed in place)
+ *     commit_rd_index();
+ */
+static inline void commit_rd_index(struct vmbus_channel *channel)
+{
+       struct hv_ring_buffer_info *ring_info = &channel->inbound;
+       /*
+        * Make sure all reads are done before we update the read index since
+        * the writer may start writing to the read area once the read index
+        * is updated.
+        */
+       virt_rmb();
+       ring_info->ring_buffer->read_index = ring_info->priv_read_index;
+
+       if (hv_need_to_signal_on_read(ring_info))
+               vmbus_set_event(channel);
+}
+
 #endif /* _HYPERV_H */
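[Editor's note] Finally, a hedged end-to-end sketch of the in-place consumption pattern documented above commit_rd_index(); process_pkt() is a hypothetical per-packet handler. Assumes this header is included.

static void process_pkt(struct vmpacket_descriptor *desc);	/* hypothetical */

static void my_drain_channel(struct vmbus_channel *channel)
{
	struct vmpacket_descriptor *desc;
	bool processed = false;

	while ((desc = get_next_pkt_raw(channel)) != NULL) {
		process_pkt(desc);		/* read the payload in place */
		put_pkt_raw(channel, desc);	/* advance priv_read_index */
		processed = true;
	}

	if (processed)
		commit_rd_index(channel);	/* publish read_index and
						 * signal the host if needed */
}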