2 * Copyright (c) 2014 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
29 #include "PacketParser.h"
/* Save the compiler warning state and silence MSVC warning C4127
 * (conditional expression is constant) until the matching pop. */
31 #pragma warning( push )
32 #pragma warning( disable:4127 )
/* Debug module tag for this file; presumably consumed by the OVS_LOG_*
 * macros (their definition is not visible here) - confirm. */
38 #define OVS_DBG_MOD OVS_DBG_VXLAN
41 /* Helper macro to check if a VXLAN ID is valid. */
/* Valid means non-zero and no larger than 0xffffff, i.e. it fits in 24 bits. */
42 #define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)
/* The two macros below are inverse shifts: the VNI occupies the bits above
 * bit 39 of the 64-bit tunnel ID. TO_VNI truncates the shifted value to
 * UINT32. NOTE(review): the byte order expected for _tID is not established
 * by this file - confirm against the code that fills tunnelId. */
43 #define VXLAN_TUNNELID_TO_VNI(_tID) (UINT32)(((UINT64)(_tID)) >> 40)
44 #define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)
/* Presumably the IPv4 Do-Not-Fragment flag already in network byte order
 * (as the NBO suffix suggests) - confirm. */
45 #define IP_DF_NBO 0x0040
/* TTL written into the outer IPv4 header when the tunnel key carries none. */
46 #define VXLAN_DEFAULT_TTL 64
/* Not referenced in the visible part of this file. */
47 #define VXLAN_MULTICAST_TTL 64
/* Not referenced in the visible part of this file; the encap path writes the
 * literal 1 into instanceID instead. */
48 #define VXLAN_DEFAULT_INSTANCE_ID 1
50 /* Move to a header file */
/* Global switch context, defined in another translation unit. */
51 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
54 *----------------------------------------------------------------------------
55 * This function verifies if the VXLAN tunnel already exists, in order to
56 * avoid sending a duplicate request to the WFP base filtering engine.
57 *----------------------------------------------------------------------------
/*
 * Scans every vport reachable through switchContext->portNoHashArray and
 * looks for one whose VXLAN private data already uses udpPortDest as its
 * destination port. OvsInitVxlanTunnel uses the result as a boolean.
 *
 * NOTE(review): vport->priv is cast to POVS_VXLAN_VPORT and dereferenced for
 * every entry; no check of the vport type or of a NULL priv is visible in
 * this excerpt - confirm one exists.
 */
60 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
/* One pass per hash bucket; the index is masked with OVS_VPORT_MASK. */
63 for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
64 PLIST_ENTRY head, link, next;
66 head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
67 LIST_FORALL_SAFE(head, link, next) {
68 POVS_VPORT_ENTRY vport = NULL;
69 POVS_VXLAN_VPORT vxlanPort = NULL;
/* Recover the vport from its portNoLink list entry. */
70 vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
71 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
/* A matching destination port means a filter request was already issued. */
73 if ((udpPortDest == vxlanPort->dstPort)) {
74 /* The VXLAN tunnel was already created. */
85 *----------------------------------------------------------------------------
86 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
87 * also creates a WFP tunnel filter for the necessary destination port. The
88 * tunnel filter create request is passed to the tunnel filter threads that
89 * will complete the request at a later time when IRQL is lowered to
92 * udpDestPort: the VXLAN header is carried as the payload of a UDP frame. If
93 * the destination port of a UDP frame is udpDestPort, it is treated as VXLAN.
94 *----------------------------------------------------------------------------
/*
 * Allocates the per-vport VXLAN state, records the UDP destination port and
 * attaches the state to vport->priv. A tunnel filter is requested through
 * OvsTunnelFilterCreate() only when no existing vport already uses the port.
 *
 * Returns STATUS_INSUFFICIENT_RESOURCES when the allocation fails; otherwise
 * the status of the filter request, or STATUS_OBJECT_NAME_EXISTS (presumably
 * on the branch where a filter for this port already exists - confirm).
 *
 * NOTE(review): no release of vxlanPort or reset of vport->priv on a failed
 * filter request is visible in this excerpt - confirm who owns the cleanup.
 */
97 OvsInitVxlanTunnel(PIRP irp,
98 POVS_VPORT_ENTRY vport,
100 PFNTunnelVportPendingOp callback,
103 NTSTATUS status = STATUS_SUCCESS;
104 POVS_VXLAN_VPORT vxlanPort = NULL;
106 vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
108 if (vxlanPort == NULL) {
109 return STATUS_INSUFFICIENT_RESOURCES;
/* Start from an all-zero state so filterID is 0 until a filter is created. */
112 RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
113 vxlanPort->dstPort = udpDestPort;
114 vport->priv = (PVOID)vxlanPort;
/* Avoid a duplicate filter request for a port that is already in use. */
116 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
117 status = OvsTunnelFilterCreate(irp,
119 &vxlanPort->filterID,
123 status = STATUS_OBJECT_NAME_EXISTS;
130 *----------------------------------------------------------------------------
131 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
132 * WFP tunnel filter previously created. The tunnel filter delete request is
133 * passed to the tunnel filter threads that will complete the request at a
134 * later time when IRQL is lowered to PASSIVE_LEVEL.
135 *----------------------------------------------------------------------------
/*
 * Tears down the VXLAN state attached to a vport: requests deletion of the
 * tunnel filter when one was recorded (filterID != 0) and frees vport->priv.
 *
 * Returns STATUS_SUCCESS without doing anything when the vport is not a
 * VXLAN vport or has no private data; otherwise status holds the result of
 * the delete request when one was issued.
 *
 * NOTE(review): no reset of vport->priv after the free is visible in this
 * excerpt - confirm the pointer is not reused afterwards.
 */
138 OvsCleanupVxlanTunnel(PIRP irp,
139 POVS_VPORT_ENTRY vport,
140 PFNTunnelVportPendingOp callback,
143 NTSTATUS status = STATUS_SUCCESS;
144 POVS_VXLAN_VPORT vxlanPort = NULL;
/* Nothing to clean up for other vport types or an uninitialized vport. */
146 if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
147 vport->priv == NULL) {
148 return STATUS_SUCCESS;
151 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
/* filterID stays 0 when this vport never created a filter of its own. */
153 if (vxlanPort->filterID != 0) {
154 status = OvsTunnelFilterDelete(irp,
/* Same pool tag that the free routine is given for every VXLAN vport. */
159 OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
168 *----------------------------------------------------------------------------
170 * Encapsulates the packet.
171 *----------------------------------------------------------------------------
/*
 * Prepends the outer Ethernet / IPv4 / UDP / VXLAN headers to a packet.
 *
 * Visible sequence:
 *  - reads the large-send-offload info of curNbl to obtain the MSS and calls
 *    OvsTcpSegmentNBL(); the condition guarding this path is not visible;
 *  - when *newNbl is still NULL, copies curNbl with OvsPartialCopyNBL(),
 *    asking for headRoom bytes of headroom;
 *  - computes the inner IPv4 / TCP / UDP checksums that were requested for
 *    offload and clears the checksum info on *newNbl;
 *  - for each NET_BUFFER, retreats the data start by headRoom and fills in
 *    the outer headers from fwdInfo, tunKey and the vport VXLAN state.
 *
 * Returns NDIS_STATUS_FAILURE when segmentation or the copy fails and
 * STATUS_SUCCESS after the headers are written. The trailing OvsCompleteNBL()
 * call is presumably the error exit (its label is not visible) - confirm.
 */
173 static __inline NDIS_STATUS
174 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
175 PNET_BUFFER_LIST curNbl,
176 OvsIPv4TunnelKey *tunKey,
177 POVS_FWD_INFO fwdInfo,
178 POVS_PACKET_HDR_INFO layers,
179 POVS_SWITCH_CONTEXT switchContext,
180 PNET_BUFFER_LIST *newNbl)
190 POVS_VXLAN_VPORT vportVxlan;
/* Bytes needed in front of the packet for all outer headers. */
191 UINT32 headRoom = OvsGetVxlanTunHdrSize();
196 * XXX: the assumption currently is that the NBL is owned by OVS, and
197 * headroom has already been allocated as part of allocating the NBL and
200 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
201 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
/* Pick the MSS from whichever LSO descriptor version the sender used. */
204 NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
206 tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
207 TcpLargeSendNetBufferListInfo);
208 switch (tsoInfo.Transmit.Type) {
209 case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
210 mss = tsoInfo.LsoV1Transmit.MSS;
212 case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
213 mss = tsoInfo.LsoV2Transmit.MSS;
216 OVS_LOG_ERROR("Unknown LSO transmit type:%d",
217 tsoInfo.Transmit.Type);
219 OVS_LOG_TRACE("MSS %u packet len %u", mss,
222 OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
223 *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
225 if (*newNbl == NULL) {
226 OVS_LOG_ERROR("Unable to segment NBL");
227 return NDIS_STATUS_FAILURE;
229 /* Clear out LSO flags after this point */
230 NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
234 vportVxlan = (POVS_VXLAN_VPORT) GetOvsVportPriv(vport);
237 /* If we didn't split the packet above, make a copy now */
238 if (*newNbl == NULL) {
239 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
241 if (*newNbl == NULL) {
242 OVS_LOG_ERROR("Unable to copy NBL");
243 return NDIS_STATUS_FAILURE;
246 * To this point we do not have VXLAN offloading.
247 * Apply defined checksums
249 curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
250 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
/* The mapping can fail under memory pressure; NDIS_STATUS_RESOURCES is the
 * status recorded for that case. */
251 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
253 status = NDIS_STATUS_RESOURCES;
/* The offload request is read from the original curNbl, while the checksums
 * below are written into the first NET_BUFFER of *newNbl. */
257 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
258 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
259 TcpIpChecksumNetBufferListInfo);
261 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
263 if (layers->isIPv4) {
264 IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
266 if (csumInfo.Transmit.IpHeaderChecksum) {
268 ip->check = IPChecksum((UINT8 *)ip, 4 * ip->ihl, 0);
/* L4 checksums are done in two steps: seed the field with the pseudo-header
 * sum, then replace it with the checksum computed over the L4 bytes. */
271 if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
272 UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
273 TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
274 tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
275 IPPROTO_TCP, csumLength);
276 tcp->check = CalculateChecksumNB(curNb, csumLength,
277 (UINT32)(layers->l4Offset));
278 } else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
279 UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
/* NOTE(review): the UDP header is located at ip + sizeof *ip, whereas the TCP
 * branch and the checksum call use layers->l4Offset. The two differ when the
 * IPv4 header carries options - confirm. */
280 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
281 udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
282 IPPROTO_UDP, csumLength);
283 udp->check = CalculateChecksumNB(curNb, csumLength,
284 (UINT32)(layers->l4Offset));
286 } else if (layers->isIPv6) {
287 IPv6Hdr *ip = (IPv6Hdr *)(bufferStart + layers->l3Offset);
289 if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
290 UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
291 TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
292 tcp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
293 (UINT32 *) &ip->daddr,
294 IPPROTO_TCP, csumLength);
295 tcp->check = CalculateChecksumNB(curNb, csumLength,
296 (UINT32)(layers->l4Offset));
297 } else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
298 UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
/* NOTE(review): same pattern as the IPv4 UDP branch; ip + sizeof *ip is the
 * L4 header only when no IPv6 extension headers are present - confirm. */
299 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
300 udp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
301 (UINT32 *) &ip->daddr,
302 IPPROTO_UDP, csumLength);
303 udp->check = CalculateChecksumNB(curNb, csumLength,
304 (UINT32)(layers->l4Offset));
307 /* Clear out TcpIpChecksumNetBufferListInfo flag */
308 NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = 0;
/* NOTE(review): the loop walks the NET_BUFFERs of curNbl; whether curNbl has
 * been re-pointed at *newNbl before this point is not visible - confirm. */
312 for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
313 curNb = curNb->Next) {
/* Expose headRoom bytes in front of the payload for the outer headers. */
314 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
315 if (status != NDIS_STATUS_SUCCESS) {
319 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
320 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
322 status = NDIS_STATUS_RESOURCES;
326 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
327 if (NET_BUFFER_NEXT_NB(curNb)) {
328 OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
329 NET_BUFFER_DATA_LENGTH(curNb->Next));
/* Outer Ethernet header. The assertion checks that srcMacAddr directly
 * follows dstMacAddr in fwdInfo, which lets one copy fill both the
 * Destination and Source fields. */
333 ethHdr = (EthHdr *)bufferStart;
334 ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
335 (PCHAR)&fwdInfo->srcMacAddr);
336 NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
337 sizeof ethHdr->Destination + sizeof ethHdr->Source);
338 ethHdr->Type = htons(ETH_TYPE_IPV4);
/* Outer IPv4 header, fixed size (no options). */
341 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
343 ipHdr->ihl = sizeof *ipHdr / 4;
/* NOTE(review): the version field is filled from a protocol-number constant;
 * this is correct only if IPPROTO_IPV4 equals 4 - confirm. */
344 ipHdr->version = IPPROTO_IPV4;
345 ipHdr->tos = tunKey->tos;
346 ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
/* The per-vport ipId counter is advanced by the buffer length and the result
 * is truncated to 16 bits for the IP identification field. */
347 ipHdr->id = (uint16)atomic_add64(&vportVxlan->ipId,
348 NET_BUFFER_DATA_LENGTH(curNb));
349 ipHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
351 ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
352 ipHdr->protocol = IPPROTO_UDP;
353 ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
354 ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
355 ipHdr->saddr = fwdInfo->srcIpAddr;
356 ipHdr->daddr = fwdInfo->dstIpAddr;
359 ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
/* Outer UDP header. */
362 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
/* NOTE(review): OR-ing with MAXINT16 forces every bit of that mask to 1; if
 * MAXINT16 is 0x7fff only the top bit of the source port still varies with
 * the flow hash - confirm this is intended. */
363 udpHdr->source = htons(tunKey->flow_hash | MAXINT16);
364 udpHdr->dest = htons(vportVxlan->dstPort);
/* UDP length = inner frame (data length minus headRoom) + UDP + VXLAN. */
365 udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
366 sizeof *udpHdr + sizeof *vxlanHdr);
/* VXLAN header. */
370 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
371 vxlanHdr->flags1 = 0;
372 vxlanHdr->locallyReplicate = 0;
373 vxlanHdr->flags2 = 0;
374 vxlanHdr->reserved1 = 0;
/* NOTE(review): the bitwise OR makes this condition true whenever
 * OVS_TNL_F_KEY is non-zero, so the VNI is always written. If a test of the
 * flag was intended the operator should be a bitwise AND, but then vxlanID
 * and instanceID would need an explicit value on the other path because no
 * other assignment to them is visible - confirm the intent before changing. */
375 if (tunKey->flags | OVS_TNL_F_KEY) {
376 vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
377 vxlanHdr->instanceID = 1;
379 vxlanHdr->reserved2 = 0;
/* NOTE(review): an NTSTATUS name is returned from a function declared to
 * return NDIS_STATUS; the other exits use NDIS_STATUS_* names. */
381 return STATUS_SUCCESS;
384 OvsCompleteNBL(switchContext, *newNbl, TRUE);
391 *----------------------------------------------------------------------------
393 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
394 * enqueues a callback that does encapsulation after resolution.
395 *----------------------------------------------------------------------------
/*
 * Entry point for VXLAN encapsulation. Looks up the forwarding information
 * for tunKey->dst; on success hands the packet to OvsDoEncapVxlan() and
 * returns its status. When the lookup fails it issues an IP helper request
 * for the tunnel key and returns NDIS_STATUS_FAILURE (the deferred
 * NDIS_STATUS_PENDING path is commented out).
 */
398 OvsEncapVxlan(POVS_VPORT_ENTRY vport,
399 PNET_BUFFER_LIST curNbl,
400 OvsIPv4TunnelKey *tunKey,
401 POVS_SWITCH_CONTEXT switchContext,
402 POVS_PACKET_HDR_INFO layers,
403 PNET_BUFFER_LIST *newNbl)
406 OVS_FWD_INFO fwdInfo;
408 status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
409 if (status != STATUS_SUCCESS) {
/* The request carries no packet and no callback: only the tunnel key is
 * passed, every other argument is NULL or 0. */
410 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
411 // return NDIS_STATUS_PENDING;
413 * XXX: Don't know if the completionList will make any sense when
414 * accessed in the callback. Make sure the caveats are known.
416 * XXX: This code will work once we are able to grab locks in the
419 return NDIS_STATUS_FAILURE;
422 return OvsDoEncapVxlan(vport, curNbl, tunKey, &fwdInfo, layers,
423 switchContext, newNbl);
427 *----------------------------------------------------------------------------
428 * OvsCalculateUDPChecksum
429 * Calculate UDP checksum
430 *----------------------------------------------------------------------------
/*
 * Software verification of the outer UDP checksum, used when the receive
 * checksum info of curNbl does not have UdpChecksumSucceeded set.
 *
 * The received checksum is saved, the checksum is recomputed over the UDP
 * datagram and the two values are compared. Returns
 * NDIS_STATUS_INVALID_PACKET on a mismatch; otherwise marks
 * UdpChecksumSucceeded in the NBL info and returns NDIS_STATUS_SUCCESS.
 *
 * NOTE(review): the recomputed value is stored into udpHdr->check, so the
 * header field in the packet buffer is overwritten during verification.
 */
432 static __inline NDIS_STATUS
433 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
439 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
442 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
444 /* Next check if UDP checksum has been calculated. */
445 if (!csumInfo.Receive.UdpChecksumSucceeded) {
/* Keep the checksum that arrived on the wire for the comparison below. */
448 checkSum = udpHdr->check;
/* Length of the UDP datagram: the frame minus Ethernet and IPv4 headers. */
450 l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
453 IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
454 (UINT32 *)&ipHdr->daddr,
455 IPPROTO_UDP, (UINT16)l4Payload);
456 udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
457 sizeof(EthHdr) + ipHdr->ihl * 4);
458 if (checkSum != udpHdr->check) {
459 OVS_LOG_TRACE("UDP checksum incorrect.");
460 return NDIS_STATUS_INVALID_PACKET;
/* Record the result so that later code sees the checksum as verified. */
464 csumInfo.Receive.UdpChecksumSucceeded = 1;
465 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
466 return NDIS_STATUS_SUCCESS;
470 *----------------------------------------------------------------------------
472 * Decapsulates the tunnel header in 'curNbl' and stores it in 'tunKey'.
473 *----------------------------------------------------------------------------
/*
 * Strips the outer Ethernet / IPv4 / UDP / VXLAN headers from a received
 * packet and records the tunnel parameters in tunKey.
 *
 * Visible sequence: reject packets no longer than the tunnel header size,
 * copy the NBL so the headers sit in one MDL, read src/dst/tos/ttl from the
 * outer IPv4 header, validate the UDP checksum, extract the VNI and advance
 * the data start past the tunnel headers.
 *
 * Returns NDIS_STATUS_INVALID_LENGTH for short packets,
 * NDIS_STATUS_RESOURCES when the copy fails and NDIS_STATUS_SUCCESS after
 * the headers are stripped. The trailing OvsCompleteNBL() call is
 * presumably the error exit (its label is not visible) - confirm.
 */
476 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
477 PNET_BUFFER_LIST curNbl,
478 OvsIPv4TunnelKey *tunKey,
479 PNET_BUFFER_LIST *newNbl)
487 UINT32 tunnelSize = 0, packetLength = 0;
491 /* Check the length of the UDP payload */
492 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
493 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
494 tunnelSize = OvsGetVxlanTunHdrSize();
495 if (packetLength <= tunnelSize) {
496 return NDIS_STATUS_INVALID_LENGTH;
500 * Create a copy of the NBL so that we have all the headers in one MDL.
502 *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
503 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
504 TRUE /*copy NBL info */);
506 if (*newNbl == NULL) {
507 return NDIS_STATUS_RESOURCES;
510 /* XXX: Handle VLAN header. */
/* NOTE(review): the headers are read through curNbl; whether curNbl has been
 * re-pointed at *newNbl before this point is not visible - confirm. */
512 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
513 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
/* NOTE(review): the MDL offset is added to the mapping result in the same
 * expression, so a NULL return from MmGetSystemAddressForMdlSafe would no
 * longer compare equal to NULL afterwards unless the offset is 0 - confirm
 * how the failure check that sets NDIS_STATUS_RESOURCES tests it. */
514 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
515 NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
517 status = NDIS_STATUS_RESOURCES;
521 ethHdr = (EthHdr *)bufferStart;
522 /* XXX: Handle IP options. */
/* The outer IPv4 header is assumed to follow the Ethernet header directly and
 * to have the fixed size of IPHdr. */
523 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
524 tunKey->src = ipHdr->saddr;
525 tunKey->dst = ipHdr->daddr;
526 tunKey->tos = ipHdr->tos;
527 tunKey->ttl = ipHdr->ttl;
529 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
531 /* Validate if NIC has indicated checksum failure. */
/* The second argument tells the helper whether the sender left the UDP
 * checksum field at 0. */
532 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
533 if (status != NDIS_STATUS_SUCCESS) {
537 /* Calculate and verify UDP checksum if NIC didn't do it. */
538 if (udpHdr->check != 0) {
539 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
540 if (status != NDIS_STATUS_SUCCESS) {
545 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
/* A set instanceID bit marks the VNI as present; it becomes the tunnel ID. */
546 if (vxlanHdr->instanceID) {
547 tunKey->flags = OVS_TNL_F_KEY;
548 tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
551 tunKey->tunnelId = 0;
554 /* Clear out the receive flag for the inner packet. */
555 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
/* Drop the outer headers so the buffer starts at the inner frame. */
556 NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
557 return NDIS_STATUS_SUCCESS;
560 OvsCompleteNBL(switchContext, *newNbl, TRUE);
/*
 * Extracts the tunnel key from a VXLAN packet without modifying it. The
 * outer IPv4, UDP and VXLAN headers are read through the OvsGetIp(),
 * OvsGetUdp() and OvsGetPacketBytes() helpers, and src, dst, ttl, tos,
 * flags and tunnelId of tunnelKey are filled in.
 *
 * status starts as NDIS_STATUS_FAILURE and is set to NDIS_STATUS_SUCCESS
 * once the key has been filled in.
 *
 * NOTE(review): nh, udp and VxlanHeader are dereferenced below; the NULL
 * checks on the helper results are not visible in this excerpt - confirm.
 */
567 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
568 OvsIPv4TunnelKey *tunnelKey)
570 NDIS_STATUS status = NDIS_STATUS_FAILURE;
573 VXLANHdr *VxlanHeader;
574 VXLANHdr VxlanHeaderBuffer;
575 struct IPHdr ip_storage;
576 const struct IPHdr *nh;
577 OVS_PACKET_HDR_INFO layers;
582 nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
/* The L4 offset honors the IPv4 header length field, including options. */
584 layers.l4Offset = layers.l3Offset + nh->ihl * 4;
589 /* make sure it's a VXLAN packet */
590 udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
/* The VXLAN header starts right after the fixed-size UDP header. */
592 layers.l7Offset = layers.l4Offset + sizeof *udp;
597 VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
598 sizeof(*VxlanHeader),
603 tunnelKey->src = nh->saddr;
604 tunnelKey->dst = nh->daddr;
605 tunnelKey->ttl = nh->ttl;
606 tunnelKey->tos = nh->tos;
/* A set instanceID bit marks the VNI as present; otherwise the key is
 * reported with no flags and a zero tunnel ID. */
607 if (VxlanHeader->instanceID) {
608 tunnelKey->flags = OVS_TNL_F_KEY;
609 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
611 tunnelKey->flags = 0;
612 tunnelKey->tunnelId = 0;
617 status = NDIS_STATUS_SUCCESS;
/* Restore the warning state saved by the warning( push ) near the top of
 * the file. */
624 #pragma warning( pop )