From 2e7cf69a4b8698a4f804ba68101a19f9795d3f8e Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 6 Jan 2026 12:12:53 +0000 Subject: [PATCH 1/4] Attached (External) Subnets This PR reworks certain aspects of how NAT and gateway layers are constructed to enable attached (external) subnets to function. The primary aim here is that traffic to/from these subnets (owned by the host) does not undergo NAT, and bypasses spoof detection as transit IPs would. A few wider changes have been necessary to ensure that these can be attached/detached without breaking any existing transit IPs, and to ensure that traffic originating from an external subnet cannot be directed towards a private VPC recipient. --- .github/buildomat/jobs/opte-api.sh | 2 +- .github/buildomat/jobs/opte-ioctl.sh | 2 +- .github/buildomat/jobs/opte.sh | 4 +- .github/buildomat/jobs/opteadm.sh | 2 +- .github/buildomat/jobs/oxide-vpc.sh | 2 +- .github/buildomat/jobs/xde.sh | 2 +- bench/src/packet.rs | 14 +- bin/opteadm/src/bin/opteadm.rs | 48 ++- crates/opte-api/src/cmd.rs | 125 +++++-- crates/opte-api/src/lib.rs | 4 +- lib/opte-ioctl/src/lib.rs | 41 ++- lib/opte-test-utils/src/lib.rs | 28 +- lib/opte/src/dynamic.rs | 18 +- lib/opte/src/engine/layer.rs | 10 +- lib/opte/src/engine/nat.rs | 65 +++- lib/opte/src/engine/port/meta.rs | 61 +++- lib/opte/src/engine/predicate.rs | 8 +- lib/opte/src/engine/rule.rs | 11 +- lib/opte/src/engine/snat.rs | 29 +- lib/oxide-vpc/src/api.rs | 76 ++++- lib/oxide-vpc/src/cfg.rs | 31 ++ lib/oxide-vpc/src/engine/firewall.rs | 12 +- lib/oxide-vpc/src/engine/gateway/arp.rs | 14 +- lib/oxide-vpc/src/engine/gateway/dhcp.rs | 32 +- lib/oxide-vpc/src/engine/gateway/dhcpv6.rs | 24 +- lib/oxide-vpc/src/engine/gateway/icmp.rs | 17 +- lib/oxide-vpc/src/engine/gateway/icmpv6.rs | 46 ++- lib/oxide-vpc/src/engine/gateway/mod.rs | 195 ++++++++--- lib/oxide-vpc/src/engine/gateway/transit.rs | 98 ++++-- lib/oxide-vpc/src/engine/geneve.rs | 4 +- lib/oxide-vpc/src/engine/nat.rs | 202 +++++++++-- 
lib/oxide-vpc/src/engine/overlay.rs | 131 ++++---- lib/oxide-vpc/src/engine/router.rs | 22 +- lib/oxide-vpc/tests/firewall_tests.rs | 8 +- lib/oxide-vpc/tests/integration_tests.rs | 355 ++++++++++++++++++-- xde-tests/src/lib.rs | 15 +- xde/src/xde.rs | 78 ++++- 37 files changed, 1404 insertions(+), 432 deletions(-) diff --git a/.github/buildomat/jobs/opte-api.sh b/.github/buildomat/jobs/opte-api.sh index 52e6c43c..8b2fd764 100755 --- a/.github/buildomat/jobs/opte-api.sh +++ b/.github/buildomat/jobs/opte-api.sh @@ -28,7 +28,7 @@ header "analyze std" ptime -m cargo clippy --all-targets header "analyze no_std" -ptime -m cargo clippy --no-default-features --all-targets +ptime -m cargo clippy --no-default-features --all-targets -- --deny warnings header "test" ptime -m cargo test diff --git a/.github/buildomat/jobs/opte-ioctl.sh b/.github/buildomat/jobs/opte-ioctl.sh index b0363aaa..6340ebab 100755 --- a/.github/buildomat/jobs/opte-ioctl.sh +++ b/.github/buildomat/jobs/opte-ioctl.sh @@ -22,4 +22,4 @@ header "check style" ptime -m cargo +$NIGHTLY fmt -- --check header "analyze" -ptime -m cargo clippy --all-targets +ptime -m cargo clippy --all-targets -- --deny warnings diff --git a/.github/buildomat/jobs/opte.sh b/.github/buildomat/jobs/opte.sh index 7885d01b..82734005 100755 --- a/.github/buildomat/jobs/opte.sh +++ b/.github/buildomat/jobs/opte.sh @@ -31,10 +31,10 @@ RUSTDOCFLAGS="-D warnings" ptime -m \ cargo +$NIGHTLY doc --no-default-features --features=api,std,engine,kernel header "analyze std + api" -ptime -m cargo clippy --all-targets +ptime -m cargo clippy --all-targets -- --deny warnings header "analyze no_std + engine + kernel" -ptime -m cargo +$NIGHTLY clippy --no-default-features --features engine,kernel +ptime -m cargo +$NIGHTLY clippy --no-default-features --features engine,kernel -- --deny warnings header "test" ptime -m cargo test diff --git a/.github/buildomat/jobs/opteadm.sh b/.github/buildomat/jobs/opteadm.sh index f533d38c..1bfc523a 100755 --- 
a/.github/buildomat/jobs/opteadm.sh +++ b/.github/buildomat/jobs/opteadm.sh @@ -31,7 +31,7 @@ header "check style" ptime -m cargo +$NIGHTLY fmt -- --check header "analyze" -ptime -m cargo clippy --all-targets +ptime -m cargo clippy --all-targets -- --deny warnings header "debug build" ptime -m cargo build diff --git a/.github/buildomat/jobs/oxide-vpc.sh b/.github/buildomat/jobs/oxide-vpc.sh index bde40131..9e9638a4 100755 --- a/.github/buildomat/jobs/oxide-vpc.sh +++ b/.github/buildomat/jobs/oxide-vpc.sh @@ -31,7 +31,7 @@ RUSTDOCFLAGS="-D warnings" ptime -m \ cargo +$NIGHTLY doc --no-default-features --features=api,std,engine,kernel header "analyze std + api + usdt" -ptime -m cargo clippy --features usdt --all-targets +ptime -m cargo clippy --features usdt --all-targets -- --deny warnings header "analyze no_std + engine + kernel" ptime -m cargo +$NIGHTLY clippy --no-default-features --features engine,kernel diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index 82baf11c..7f34b1d9 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -113,7 +113,7 @@ sha256sum $REL_TGT/xde_link.so > $REL_TGT/xde_link.so.sha256 header "build xde integration tests" pushd xde-tests cargo +$NIGHTLY fmt -- --check -cargo clippy --all-targets +cargo clippy --all-targets -- --deny warnings cargo build --test loopback loopback_test=$( cargo build -q --test loopback --message-format=json |\ diff --git a/bench/src/packet.rs b/bench/src/packet.rs index b555f946..0788ca60 100644 --- a/bench/src/packet.rs +++ b/bench/src/packet.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use opte::ddi::mblk::MsgBlk; use opte::engine::Direction; @@ -26,6 +26,7 @@ use opte_test_utils::icmp::gen_icmp_echo; use opte_test_utils::icmp::gen_icmpv6_echo; use opte_test_utils::icmp::generate_ndisc; use opte_test_utils::*; +use std::collections::BTreeMap; pub type TestCase = (MsgBlk, Direction); @@ -91,6 +92,8 @@ impl BenchPacket for UlpProcess { ephemeral_ip: Some("10.60.1.20".parse().unwrap()), floating_ips: vec![], }, + attached_subnets: BTreeMap::default(), + transit_ips: BTreeMap::default(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -104,6 +107,8 @@ impl BenchPacket for UlpProcess { ephemeral_ip: Some("2001:db8::2".parse().unwrap()), floating_ips: vec![], }, + attached_subnets: BTreeMap::default(), + transit_ips: BTreeMap::default(), }, }; @@ -269,18 +274,13 @@ impl BenchPacketInstance for UlpProcessInstance { let out_pkt = match self.direction { Direction::Out => inner_pkt, Direction::In => { - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; let guest_phys = TestIpPhys { ip: self.cfg.phys_ip, mac: self.cfg.guest_mac, vni: self.cfg.vni, }; - encap_external(inner_pkt, bsvc_phys, guest_phys) + encap_external(inner_pkt, *BSVC_PHYS, guest_phys) } }; diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index fb4334db..a177dbb7 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use anyhow::Context; use clap::Args; @@ -70,6 +70,7 @@ use oxide_vpc::print::print_mcast_fwd; use oxide_vpc::print::print_mcast_subs; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; +use std::collections::BTreeMap; use std::io; use std::io::Write; use std::str::FromStr; @@ -403,6 +404,36 @@ enum Command { #[arg(long = "dir")] direction: Option, }, + + /// Give a guest ownership of a given CIDR block. + /// + /// This is equivalent to a bidirectional `AllowCidr`, with an exemption + /// from NAT if the subnet is marked as `external`. + /// + /// Repeated calls on any given `prefix` will update its configuration. + AttachSubnet { + /// The OPTE port to configure. + #[arg(short)] + port: String, + + /// The subnet to attach. + prefix: IpCidr, + + /// Marks the subnet as a block of external IPs for which in/outbound + /// NAT should not be performed. + #[arg(long, short)] + external: bool, + }, + + /// Rescind a guest's ownership of a given CIDR block. + DetachSubnet { + /// The OPTE port to configure. + #[arg(short)] + port: String, + + /// The subnet to detach. 
+ prefix: IpCidr, + }, } #[derive(Debug, Parser)] @@ -805,6 +836,8 @@ fn main() -> anyhow::Result<()> { private_ip, gateway_ip, external_ips, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }) } IpAddr::Ip6(private_ip) => { @@ -823,6 +856,8 @@ fn main() -> anyhow::Result<()> { private_ip, gateway_ip, external_ips, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }) } }; @@ -833,9 +868,10 @@ fn main() -> anyhow::Result<()> { gateway_mac, vni: vpc_vni, phys_ip: src_underlay_addr, + dhcp: dhcp.into(), }; - hdl.create_xde(&name, cfg, dhcp.into(), passthrough)?; + hdl.create_xde(&name, cfg, passthrough)?; } Command::DeleteXde { name } => { @@ -1054,6 +1090,14 @@ fn main() -> anyhow::Result<()> { })?; } } + + Command::AttachSubnet { port, prefix, external } => { + hdl.attach_subnet(&port, prefix, external)?; + } + + Command::DetachSubnet { port, prefix } => { + hdl.detach_subnet(&port, prefix)?; + } } Ok(()) diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index d69a0a8a..8238511e 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use super::API_VERSION; use super::RuleId; @@ -25,40 +25,95 @@ pub const XDE_IOC_OPTE_CMD: i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, // clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx - SetMcastForwarding = 100, // set multicast forwarding entries - ClearMcastForwarding = 101, // clear multicast forwarding entries - DumpMcastForwarding = 102, // dump multicast forwarding table - McastSubscribe = 103, // subscribe a port to a multicast group - McastUnsubscribe = 104, // unsubscribe a port from a multicast group - SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) - ClearMcast2Phys = 106, // clear M2P mapping - DumpMcastSubscriptions = 107, // dump multicast subscription table - 
McastUnsubscribeAll = 108, // unsubscribe all ports from a multicast group + /// List all ports. + ListPorts = 1, + + /// Add a firewall rule. + AddFwRule = 20, + /// Remove a firewall rule. + RemFwRule = 21, + /// Set/replace all firewall rules at once. + SetFwRules = 22, + + /// Read out TCP flows and statistics. + DumpTcpFlows = 30, + /// Read out installed rules and hit counters in a given layer. + DumpLayer = 31, + /// Read out UFT (fastpath) flow entries and their associated counters. + DumpUft = 32, + /// List the layers on a given port. + ListLayers = 33, + + /// Clear the UFT (fastpath) for a port. + ClearUft = 40, + /// Clear a layer's flow table. + ClearLft = 41, + + /// Set a V2P mapping. + SetVirt2Phys = 50, + /// Read out all V2P mappings. + DumpVirt2Phys = 51, + /// Set a V2B mapping. + SetVirt2Boundary = 52, + /// Remove a V2B mapping. + ClearVirt2Boundary = 53, + /// Read out all V2B mappings. + DumpVirt2Boundary = 54, + /// Remove a V2P mapping. + ClearVirt2Phys = 55, + + /// Add a router entry for an IP destination CIDR. + AddRouterEntry = 60, + /// Remove a router entry for an IP destination CIDR. + DelRouterEntry = 61, + + /// Create a new XDE device. + /// + /// Requires that `SetXdeUnderlay` has been successfully called. + CreateXde = 70, + /// Delete an XDE device. + DeleteXde = 71, + /// Set the physical devices which XDE should transmit over. + SetXdeUnderlay = 72, + /// Unbind the underlay devices. + /// + /// Requires that no XDE ports exist. + ClearXdeUnderlay = 73, + + /// Set all external IP config for a port. + SetExternalIps = 80, + + /// Add a transit IP CIDR to this port's allow list. + /// + /// NOOPs if the given CIDR is an attached subnet. + AllowCidr = 90, + /// Remove a transit IP CIDR from this port's allow list. + /// + /// NOOPs if the given CIDR is an attached subnet. + RemoveCidr = 91, + /// Add or set the config of an attached subnet. + AttachSubnet = 92, + /// Remove an attached subnet. 
+ DetachSubnet = 93, + + /// Set multicast forwarding entries. + SetMcastForwarding = 100, + /// Clear multicast forwarding entries. + ClearMcastForwarding = 101, + /// Read out the multicast forwarding table. + DumpMcastForwarding = 102, + /// Subscribe a port to a multicast group. + McastSubscribe = 103, + /// Unsubscribe a port from a multicast group. + McastUnsubscribe = 104, + /// Set an M2P mapping (group -> underlay mcast). + SetMcast2Phys = 105, + /// Remove an M2P mapping. + ClearMcast2Phys = 106, + /// Read out the table of multicast subscriptions. + DumpMcastSubscriptions = 107, + /// Unsubscribe all ports from a multicast group. + McastUnsubscribeAll = 108, } impl TryFrom for OpteCmd { diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 558a6e41..99fb077e 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company #![no_std] #![deny(unreachable_patterns)] @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 38; +pub const API_VERSION: u64 = 39; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index 510fc9a1..1c4147d6 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use opte::api::API_VERSION; use opte::api::ClearLftReq; @@ -27,6 +27,8 @@ use opte::api::XDE_IOC_OPTE_CMD; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AllowCidrReq; +use oxide_vpc::api::AttachSubnetReq; +use oxide_vpc::api::AttachedSubnetConfig; use oxide_vpc::api::ClearMcast2PhysReq; use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; @@ -35,7 +37,8 @@ use oxide_vpc::api::CreateXdeReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; -use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DetachSubnetReq; +use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; @@ -123,7 +126,6 @@ impl OpteHdl { &self, name: &str, cfg: VpcCfg, - dhcp: DhcpCfg, passthrough: bool, ) -> Result { use libnet::link; @@ -136,7 +138,7 @@ impl OpteHdl { let xde_devname = name.into(); let cmd = OpteCmd::CreateXde; - let req = CreateXdeReq { xde_devname, linkid, cfg, dhcp, passthrough }; + let req = CreateXdeReq { xde_devname, linkid, cfg, passthrough }; let res = run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)); @@ -395,6 +397,37 @@ impl OpteHdl { ) } + pub fn attach_subnet( + &self, + port_name: &str, + cidr: IpCidr, + is_external: bool, + ) -> Result { + let cmd = OpteCmd::AttachSubnet; + run_cmd_ioctl( + self.device.as_raw_fd(), + cmd, + Some(&AttachSubnetReq { + cidr, + port_name: port_name.into(), + cfg: AttachedSubnetConfig { is_external }, + }), + ) + } + + pub fn detach_subnet( + &self, + port_name: &str, + cidr: IpCidr, + ) -> Result { + let cmd = OpteCmd::DetachSubnet; + run_cmd_ioctl( + self.device.as_raw_fd(), + cmd, + Some(&DetachSubnetReq { cidr, port_name: port_name.into() }), + ) + } + /// Return the TCP flows. 
pub fn dump_tcp_flows( &self, diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index bb128b44..e3c4da18 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Common routines for integration tests. @@ -93,8 +93,10 @@ pub use oxide_vpc::engine::overlay::VpcMappings; pub use oxide_vpc::engine::router; pub use port_state::*; pub use smoltcp::wire::IpProtocol; +use std::collections::BTreeMap; pub use std::num::NonZeroU32; pub use std::sync::Arc; +use std::sync::LazyLock; /// Expects that a packet result is modified, and applies that modification. #[macro_export] @@ -179,6 +181,8 @@ pub fn g1_cfg() -> VpcCfg { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -192,6 +196,8 @@ pub fn g1_cfg() -> VpcCfg { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; g1_cfg2(ip_cfg) @@ -207,6 +213,7 @@ pub fn g1_cfg2(ip_cfg: IpCfg) -> VpcCfg { phys_ip: Ipv6Addr::from([ 0xFD00, 0x0000, 0x00F7, 0x0101, 0x0000, 0x0000, 0x0000, 0x0001, ]), + dhcp: base_dhcp_config(), } } @@ -224,6 +231,8 @@ pub fn g2_cfg() -> VpcCfg { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -237,6 +246,8 @@ pub fn g2_cfg() -> VpcCfg { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; VpcCfg { @@ -248,6 +259,7 @@ pub fn g2_cfg() -> VpcCfg { phys_ip: Ipv6Addr::from([ 0xFD00, 0x0000, 0x00F7, 0x0116, 0x0000, 0x0000, 0x0000, 0x0001, ]), + dhcp: 
base_dhcp_config(), } } @@ -268,10 +280,8 @@ fn oxide_net_builder( let snat_limit = NonZeroU32::new(8096).unwrap(); let one_limit = NonZeroU32::new(1).unwrap(); - let dhcp = base_dhcp_config(); - firewall::setup(&mut pb, fw_limit).expect("failed to add firewall layer"); - gateway::setup(&pb, cfg, vpc_map, fw_limit, &dhcp) + gateway::setup(&pb, cfg, vpc_map, fw_limit) .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); @@ -355,14 +365,14 @@ pub fn oxide_net_setup2( v2b.set( "0.0.0.0/0".parse().unwrap(), vec![TunnelEndpoint { - ip: "fd00:9900::1".parse().unwrap(), + ip: BS_IP_ADDR, vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), }], ); v2b.set( "::/0".parse().unwrap(), vec![TunnelEndpoint { - ip: "fd00:9900::1".parse().unwrap(), + ip: BS_IP_ADDR, vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), }], ); @@ -987,6 +997,12 @@ pub struct TestIpPhys { pub vni: Vni, } +pub static BSVC_PHYS: LazyLock = LazyLock::new(|| TestIpPhys { + ip: BS_IP_ADDR, + mac: BS_MAC_ADDR, + vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), +}); + /// Encapsulate a guest packet, marking that it has arrived from beyond /// the rack. #[must_use] diff --git a/lib/opte/src/dynamic.rs b/lib/opte/src/dynamic.rs index 16584419..b3e8e72a 100644 --- a/lib/opte/src/dynamic.rs +++ b/lib/opte/src/dynamic.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! A KRwLock-based wrapper for dynamically updateable resources (e.g., config), //! and for memoizing the outputs generated from those resources. @@ -49,6 +49,22 @@ impl Dynamic { _ = self.0.epoch.fetch_add(1, Ordering::Relaxed); } + /// Conditionally update `self`, holding exclusive access on the inner + /// value. 
+ /// + /// `f(...)` should return `Some(val)` if an update should be applied. + /// Returns `true` if `f(...)` returned `Some`. + pub fn update(&self, f: impl FnOnce(&T) -> Option) -> bool { + let mut inner = self.0.inner.write(); + if let Some(new_val) = f(&inner) { + *inner = new_val.into(); + _ = self.0.epoch.fetch_add(1, Ordering::Relaxed); + true + } else { + false + } + } + pub fn load(&self) -> Snapshot { let value_locked = self.0.inner.read(); let value = Arc::clone(&*value_locked); diff --git a/lib/opte/src/engine/layer.rs b/lib/opte/src/engine/layer.rs index d6413e27..643da97e 100644 --- a/lib/opte/src/engine/layer.rs +++ b/lib/opte/src/engine/layer.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! A layer in a port. @@ -841,7 +841,7 @@ impl Layer { Some(ActionDescEntry::Desc(desc)) => { self.stats.vals.in_lft_hit += 1; let flow_before = *pkt.flow(); - let ht = desc.gen_ht(Direction::In); + let ht = desc.gen_ht(Direction::In, ameta); pkt.hdr_transform(&ht)?; xforms.hdr.push(ht); ht_probe( @@ -1032,7 +1032,7 @@ impl Layer { }; let flow_before = *pkt.flow(); - let ht_in = desc.gen_ht(In); + let ht_in = desc.gen_ht(In, ameta); pkt.hdr_transform(&ht_in)?; xforms.hdr.push(ht_in); ht_probe( @@ -1127,7 +1127,7 @@ impl Layer { Some(ActionDescEntry::Desc(desc)) => { self.stats.vals.out_lft_hit += 1; let flow_before = *pkt.flow(); - let ht = desc.gen_ht(Direction::Out); + let ht = desc.gen_ht(Direction::Out, ameta); pkt.hdr_transform(&ht)?; xforms.hdr.push(ht); ht_probe( @@ -1320,7 +1320,7 @@ impl Layer { }; let flow_before = *pkt.flow(); - let ht_out = desc.gen_ht(Out); + let ht_out = desc.gen_ht(Out, ameta); pkt.hdr_transform(&ht_out)?; xforms.hdr.push(ht_out); ht_probe( diff --git a/lib/opte/src/engine/nat.rs b/lib/opte/src/engine/nat.rs index e8f7e190..99e26267 100644 --- 
a/lib/opte/src/engine/nat.rs +++ b/lib/opte/src/engine/nat.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! 1:1 NAT. @@ -22,15 +22,19 @@ use super::packet::Packet; use super::parse::Ulp; use super::parse::UlpRepr; use super::port::meta::ActionMeta; +use super::port::meta::ActionMetaValue; use super::predicate::DataPredicate; use super::predicate::Predicate; use super::rule; use super::rule::ActionDesc; use super::rule::AllowOrDeny; use super::rule::HdrTransform; +use super::rule::MetaAction; use super::rule::StatefulAction; use crate::engine::snat::ConcreteIpAddr; +use alloc::borrow::Cow; use alloc::boxed::Box; +use alloc::string::String; use alloc::string::ToString; use alloc::sync::Arc; use alloc::vec::Vec; @@ -106,7 +110,7 @@ impl StatefulAction for OutboundNat { &self, flow_id: &InnerFlowId, _pkt: &Packet, - _meta: &mut ActionMeta, + _meta: &ActionMeta, ) -> rule::GenDescResult { // When we have several external IPs at our disposal, we are // to use them equally. @@ -169,7 +173,7 @@ impl StatefulAction for InboundNat { &self, flow_id: &InnerFlowId, _pkt: &Packet, - _meta: &mut ActionMeta, + _meta: &ActionMeta, ) -> rule::GenDescResult { // We rely on the attached predicates to filter out IPs which are *not* // registered to this port. 
@@ -199,11 +203,13 @@ pub struct NatDesc { pub const NAT_NAME: &str = "NAT"; impl ActionDesc for NatDesc { - fn gen_ht(&self, dir: Direction) -> HdrTransform { + fn gen_ht(&self, dir: Direction, meta: &mut ActionMeta) -> HdrTransform { match dir { Direction::Out => { let ip = IpMod::new_src(self.external_ip); + meta.insert_typed(&ExternalIpTag); + HdrTransform { name: NAT_NAME.to_string(), inner_ip: HeaderAction::Modify(ip), @@ -385,6 +391,51 @@ impl fmt::Display for IcmpV6Nat { } } +/// Mark matching packets as being sent outbound from an external IP. +#[derive(Debug)] +pub struct ExternalIpTagger; + +impl fmt::Display for ExternalIpTagger { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ExternalIpTagger") + } +} + +impl MetaAction for ExternalIpTagger { + fn implicit_preds(&self) -> (Vec, Vec) { + (vec![], vec![]) + } + + fn mod_meta( + &self, + _flow_id: &InnerFlowId, + meta: &mut ActionMeta, + ) -> rule::ModMetaResult { + meta.insert_typed(&ExternalIpTag); + rule::ModMetaResult::Ok(AllowOrDeny::Allow(())) + } +} + +/// A unit-valued tag marking outbound packets using an external IP. +#[derive(Debug)] +pub struct ExternalIpTag; + +impl ActionMetaValue for ExternalIpTag { + const KEY: &'static str = "external-ip-applied"; + + fn as_meta(&self) -> Cow<'static, str> { + Cow::Borrowed(Self::KEY) + } + + fn from_meta(s: &str) -> Result { + if s == Self::KEY { + Ok(Self) + } else { + Err("malformed ExternalIpTag value".into()) + } + } +} + #[cfg(test)] mod test { use super::*; @@ -459,7 +510,7 @@ mod test { // Verify descriptor generation. 
// ================================================================ let flow_out = InnerFlowId::from(pkt.meta()); - let desc = match nat.gen_desc(&flow_out, &pkt, &mut ameta) { + let desc = match nat.gen_desc(&flow_out, &pkt, &ameta) { Ok(AllowOrDeny::Allow(desc)) => desc, _ => panic!("expected AllowOrDeny::Allow(desc) result"), }; @@ -467,7 +518,7 @@ mod test { // ================================================================ // Verify outbound header transformation // ================================================================ - let out_ht = desc.gen_ht(Direction::Out); + let out_ht = desc.gen_ht(Direction::Out, &mut ameta); let pmo = pkt.meta_mut(); out_ht.run(pmo).unwrap(); @@ -524,7 +575,7 @@ mod test { .to_full_meta(); let pmi = pkt.meta_mut(); - let in_ht = desc.gen_ht(Direction::In); + let in_ht = desc.gen_ht(Direction::In, &mut ameta); in_ht.run(pmi).unwrap(); let ether_meta = pmi.inner_ether(); diff --git a/lib/opte/src/engine/port/meta.rs b/lib/opte/src/engine/port/meta.rs index 968b6986..17d673b4 100644 --- a/lib/opte/src/engine/port/meta.rs +++ b/lib/opte/src/engine/port/meta.rs @@ -2,11 +2,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use alloc::borrow::Cow; use alloc::collections::BTreeMap; use alloc::string::String; +use core::marker::PhantomData; /// A value meant to be used in the [`ActionMeta`] map. /// @@ -34,7 +35,7 @@ pub trait ActionMetaValue: Sized { fn as_meta(&self) -> Cow<'static, str>; /// Attempt to create a value assuming that `s` was created - /// with [`Self::as_meta()`]. + /// with [`ActionMetaValue::as_meta`]. fn from_meta(s: &str) -> Result; } @@ -77,6 +78,16 @@ impl ActionMeta { self.inner.insert(key, val) } + /// Insert a value with a type-determined key into the map, + /// replacing any existing key-value pair. 
Return the value + /// being replaced, or `None`. + pub fn insert_typed( + &mut self, + val: &impl ActionMetaValue, + ) -> Option> { + self.insert(val.key(), val.as_meta()) + } + /// Remove the key-value pair with the specified key. Return /// the value, or `None` if no such entry exists. pub fn remove(&mut self, key: &str) -> Option> { @@ -89,23 +100,57 @@ impl ActionMeta { self.inner.get(key).map(|v| &**v) } + /// Get the value stored at a type's well-known key, or an error + /// if no such entry exists or the stored value fails to parse. + pub fn get_typed( + &self, + ) -> Result> { + let raw_val = + self.get(T::KEY).ok_or(ActionMetaError::NotFound(PhantomData))?; + + T::from_meta(raw_val) + .map_err(|err| ActionMetaError::ParseFailed { raw_val, err }) + } + /// Records whether this packet's destination can be reached using only /// internal/private paths. /// /// The dataplane may use this to choose a larger (jumbo-frame) MSS for /// TCP segmentation, or rely on other aspects of its internal network. pub fn set_internal_target(&mut self, val: bool) { - _ = self - .insert(InternalTarget::KEY.into(), InternalTarget(val).as_meta()); + _ = self.insert_typed(&InternalTarget(val)); } /// Returns whether this packet's destination can be reached using only /// internal/private paths. pub fn is_internal_target(&self) -> bool { - self.get(InternalTarget::KEY) - .and_then(|v| InternalTarget::from_meta(v).ok()) - .unwrap_or_default() - .0 + self.get_typed::().unwrap_or_default().0 + } +} + +/// Failure modes when reading a target `impl ActionMetaValue` from [`ActionMeta`]. +#[derive(Debug)] +pub enum ActionMetaError<'a, T> { + /// No value was stored using the type's well-known key. + NotFound(PhantomData), + /// The stored value could not be deserialised into the requested type. 
+ ParseFailed { raw_val: &'a str, err: String }, +} + +impl<'a, T: core::fmt::Debug> core::error::Error for ActionMetaError<'a, T> {} + +impl<'a, T> core::fmt::Display for ActionMetaError<'a, T> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + ActionMetaError::NotFound(_) => write!( + f, + "no {} metadata entry found", + core::any::type_name::() + ), + ActionMetaError::ParseFailed { raw_val, err } => { + write!(f, "failed to parse metadata entry '{raw_val}': {err}") + } + } } } diff --git a/lib/opte/src/engine/predicate.rs b/lib/opte/src/engine/predicate.rs index 4527efe9..c23c44a3 100644 --- a/lib/opte/src/engine/predicate.rs +++ b/lib/opte/src/engine/predicate.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Predicates used for `Rule` matching. @@ -23,6 +23,7 @@ use super::ip::v6::Ipv6Ref; use super::ip::v6::v6_get_next_header; use super::packet::MblkPacketData; use super::port::meta::ActionMeta; +use super::port::meta::ActionMetaValue; use alloc::boxed::Box; use alloc::string::String; use alloc::string::ToString; @@ -610,6 +611,11 @@ impl Predicate { false } + + /// Create a `Predicate::Meta` matching a well-specified value. + pub fn from_action_meta(val: T) -> Self { + Self::Meta(val.key().into(), val.as_meta().into()) + } } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] diff --git a/lib/opte/src/engine/rule.rs b/lib/opte/src/engine/rule.rs index 4247adb8..9cc4f9c0 100644 --- a/lib/opte/src/engine/rule.rs +++ b/lib/opte/src/engine/rule.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Rules and actions. 
@@ -174,8 +174,9 @@ where /// [`HdrTransform`] which implements the desired action. An /// ActionDesc is created by a [`StatefulAction`] implementation. pub trait ActionDesc: Send + Sync { - /// Generate the [`HdrTransform`] which implements this descriptor. - fn gen_ht(&self, dir: Direction) -> HdrTransform; + /// Generate the [`HdrTransform`] which implements this descriptor, and + /// apply any modifications to the [`ActionMeta`]. + fn gen_ht(&self, dir: Direction, meta: &mut ActionMeta) -> HdrTransform; /// Generate a body transformation. /// @@ -251,7 +252,7 @@ impl IdentityDesc { } impl ActionDesc for IdentityDesc { - fn gen_ht(&self, _dir: Direction) -> HdrTransform { + fn gen_ht(&self, _dir: Direction, _meta: &mut ActionMeta) -> HdrTransform { Default::default() } @@ -758,7 +759,7 @@ pub trait StatefulAction: Display + Send + Sync { &self, flow_id: &InnerFlowId, pkt: &Packet, - meta: &mut ActionMeta, + meta: &ActionMeta, ) -> GenDescResult; fn implicit_preds(&self) -> (Vec, Vec); diff --git a/lib/opte/src/engine/snat.rs b/lib/opte/src/engine/snat.rs index d3bbff22..1765abd7 100644 --- a/lib/opte/src/engine/snat.rs +++ b/lib/opte/src/engine/snat.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Types for working with IP Source NAT, both IPv4 and IPv6. 
@@ -33,6 +33,7 @@ use crate::api::L4Info; use crate::api::PortInfo; use crate::ddi::sync::KMutex; use crate::engine::icmp::QueryEcho; +use crate::engine::nat::ExternalIpTag; use alloc::collections::btree_map::BTreeMap; use alloc::string::ToString; use alloc::sync::Arc; @@ -91,7 +92,7 @@ type SNatAlloc = FiniteHandle>; mod private { use opte_api::Protocol; - pub trait Ip: Into { + pub trait Ip: Into + Send + Sync { const MESSAGE_PROTOCOL: Protocol; } @@ -218,7 +219,7 @@ impl From for GenDescError { } } -impl SNat { +impl SNat { pub fn new(addr: T) -> Self { SNat { priv_ip: addr, @@ -299,7 +300,7 @@ impl Display for SNat { } } -impl StatefulAction for SNat +impl StatefulAction for SNat where SNat: Display, { @@ -307,7 +308,7 @@ where &self, flow_id: &InnerFlowId, pkt: &Packet, - _meta: &mut ActionMeta, + _meta: &ActionMeta, ) -> GenDescResult { let proto = flow_id.protocol(); let priv_port = match flow_id.l4_info() { @@ -367,13 +368,15 @@ pub struct SNatDesc { pub const SNAT_NAME: &str = "SNAT"; -impl ActionDesc for SNatDesc { - fn gen_ht(&self, dir: Direction) -> HdrTransform { +impl ActionDesc for SNatDesc { + fn gen_ht(&self, dir: Direction, meta: &mut ActionMeta) -> HdrTransform { match dir { // Outbound traffic needs its source IP and source port Direction::Out => { let ip = IpMod::new_src(self.nat.entry.ip.into()); + meta.insert_typed(&ExternalIpTag); + HdrTransform { name: SNAT_NAME.to_string(), inner_ip: HeaderAction::Modify(ip), @@ -425,16 +428,18 @@ pub struct SNatIcmpEchoDesc { pub const SNAT_ICMP_ECHO_NAME: &str = "SNAT_ICMP_ECHO"; -impl ActionDesc for SNatIcmpEchoDesc { +impl ActionDesc for SNatIcmpEchoDesc { // SNAT needs to generate an additional transform for ICMP traffic in // order to treat the Echo Identifier as a psuedo ULP port. 
- fn gen_ht(&self, dir: Direction) -> HdrTransform { + fn gen_ht(&self, dir: Direction, meta: &mut ActionMeta) -> HdrTransform { match dir { // Outbound traffic needs its source IP rewritten, and its // 'source port' placed into the ICMP echo ID field. Direction::Out => { let ip = IpMod::new_src(self.nat.entry.ip.into()); + meta.insert_typed(&ExternalIpTag); + HdrTransform { name: SNAT_NAME.to_string(), inner_ip: HeaderAction::Modify(ip), @@ -560,7 +565,7 @@ mod test { // Verify descriptor generation. // ================================================================ let flow_out = InnerFlowId::from(pkt.meta()); - let desc = match snat.gen_desc(&flow_out, &pkt, &mut action_meta) { + let desc = match snat.gen_desc(&flow_out, &pkt, &action_meta) { Ok(AllowOrDeny::Allow(desc)) => desc, _ => panic!("expected AllowOrDeny::Allow(desc) result"), }; @@ -569,7 +574,7 @@ mod test { // ================================================================ // Verify outbound header transformation // ================================================================ - let out_ht = desc.gen_ht(Direction::Out); + let out_ht = desc.gen_ht(Direction::Out, &mut action_meta); out_ht.run(pkt.meta_mut()).unwrap(); let pmo = pkt.meta(); @@ -623,7 +628,7 @@ mod test { .to_full_meta(); pkt.compute_checksums(); - let in_ht = desc.gen_ht(Direction::In); + let in_ht = desc.gen_ht(Direction::In, &mut action_meta); in_ht.run(pkt.meta_mut()).unwrap(); let pmi = pkt.meta(); diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index 8c67ec25..dda0cbdb 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use alloc::collections::BTreeMap; use alloc::collections::BTreeSet; @@ -130,6 +130,26 @@ pub struct BoundaryServices { pub mac: MacAddr, } +/// Configuration for a subnet completely owned by a NIC. +/// +/// When configured this port will allow all in/out traffic matching a CIDR to +/// be received/sent. +#[derive(Debug, Clone, Serialize, Deserialize, Default, Eq, PartialEq)] +pub struct AttachedSubnetConfig { + /// Denotes whether this attached subnet is an external IP block, + /// in which case OPTE will not apply NAT on matching traffic. + pub is_external: bool, +} + +/// Configuration for an exceptions to source/destination address filtering. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct TransitIpConfig { + /// Allow inbound traffic with a destination IP in the target CIDR. + pub allow_in: bool, + /// Allow outbound traffic with a source IP in the target CIDR. + pub allow_out: bool, +} + /// The IPv4 configuration of a VPC guest. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Ipv4Cfg { @@ -149,6 +169,13 @@ pub struct Ipv4Cfg { /// External IP assignments used for rack-external communication. pub external_ips: ExternalIpCfg, + + /// Subnets owned by this NIC. + pub attached_subnets: BTreeMap, + + /// Exceptions to source/destination address filtering without the guarantee + /// of ownership provided by `attached_subnets`. + pub transit_ips: BTreeMap, } /// The IPv6 configuration of a VPC guest. @@ -174,6 +201,13 @@ pub struct Ipv6Cfg { /// External IP assignments used for rack-external communication. pub external_ips: ExternalIpCfg, + + /// Subnets owned by this NIC. + pub attached_subnets: BTreeMap, + + /// Exceptions to source/destination address filtering without the guarantee + /// of ownership provided by `attached_subnets`. + pub transit_ips: BTreeMap, } /// Configuration of NAT assignments used by a VPC guest for external networking. 
@@ -264,6 +298,9 @@ pub struct VpcCfg { /// The host (sled) IPv6 address. All guests on the same sled are /// sourced to a single IPv6 address. pub phys_ip: Ipv6Addr, + + /// Configuration for DHCP responses created by OPTE + pub dhcp: DhcpCfg, } impl VpcCfg { @@ -581,9 +618,6 @@ pub struct CreateXdeReq { /// details. pub cfg: VpcCfg, - /// Configuration for DHCP responses created by OPTE - pub dhcp: DhcpCfg, - /// This is a development tool for completely bypassing OPTE processing. /// /// XXX Pretty sure we aren't making much use of this anymore, and @@ -837,12 +871,14 @@ pub struct McastUnsubscribeAllReq { pub group: IpAddr, } +pub type InternetGatewayMap = BTreeMap>; + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, pub external_ips_v4: Option>, pub external_ips_v6: Option>, - pub inet_gw_map: Option>>, + pub inet_gw_map: Option, } #[derive(Debug, Deserialize, Serialize)] @@ -1291,6 +1327,31 @@ pub enum RemoveCidrResp { impl opte::api::cmd::CmdOk for RemoveCidrResp {} +/// Add an entry to the gateway allowing a port to send or receive +/// traffic on a CIDR other than its private IP. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AttachSubnetReq { + pub port_name: String, + pub cidr: IpCidr, + pub cfg: AttachedSubnetConfig, +} + +/// Remove entries from the gateway allowing a port to send or receive +/// traffic on a specific CIDR other than its private IP. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DetachSubnetReq { + pub port_name: String, + pub cidr: IpCidr, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub enum DetachSubnetResp { + Ok(IpCidr), + NotFound, +} + +impl opte::api::cmd::CmdOk for DetachSubnetResp {} + #[cfg(test)] pub mod tests { use super::*; @@ -1392,6 +1453,8 @@ pub mod tests { floating_ips: vec![], }, vpc_subnet: "10.0.0.0/24".parse().unwrap(), + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { private_ip: "fd00::5".parse().unwrap(), @@ -1405,9 +1468,12 @@ pub mod tests { floating_ips: vec![], }, vpc_subnet: "fd00::/64".parse().unwrap(), + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }, vni: Vni::new(100u32).unwrap(), + dhcp: DhcpCfg::default(), } } } diff --git a/lib/oxide-vpc/src/cfg.rs b/lib/oxide-vpc/src/cfg.rs index d79767b4..20122f42 100644 --- a/lib/oxide-vpc/src/cfg.rs +++ b/lib/oxide-vpc/src/cfg.rs @@ -1,9 +1,18 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2026 Oxide Computer Company + //! Reconfigurable, internal configuration built from `oxide_vpc::api`. use crate::api; +use crate::api::AttachedSubnetConfig; use crate::api::ExternalIpCfg; #[cfg(any(feature = "test-help", test))] use crate::api::PhysNet; +use crate::api::TransitIpConfig; +use alloc::collections::BTreeMap; use opte::api::*; use opte::dynamic::Dynamic; @@ -25,6 +34,13 @@ pub struct Ipv4Cfg { /// External IP assignments used for rack-external communication. pub external_ips: Dynamic>, + + /// Subnets owned by this NIC. + pub attached_subnets: Dynamic>, + + /// Exceptions to source/destination address filtering without the guarantee + /// of ownership provided by `attached_subnets`. + pub transit_ips: Dynamic>, } /// The IPv6 configuration of a VPC guest. 
@@ -50,6 +66,13 @@ pub struct Ipv6Cfg { /// External IP assignments used for rack-external communication. pub external_ips: Dynamic>, + + /// Subnets owned by this NIC. + pub attached_subnets: Dynamic>, + + /// Exceptions to source/destination address filtering without the guarantee + /// of ownership provided by `attached_subnets`. + pub transit_ips: Dynamic>, } /// The IP configuration of a VPC guest. @@ -84,6 +107,9 @@ pub struct VpcCfg { /// The host (sled) IPv6 address. All guests on the same sled are /// sourced to a single IPv6 address. pub phys_ip: Ipv6Addr, + + /// Configuration for DHCP responses created by OPTE + pub dhcp: DhcpCfg, } impl VpcCfg { @@ -184,6 +210,7 @@ impl From for VpcCfg { gateway_mac: value.gateway_mac, vni: value.vni, phys_ip: value.phys_ip, + dhcp: value.dhcp, } } } @@ -207,6 +234,8 @@ impl From for Ipv4Cfg { private_ip: value.private_ip, gateway_ip: value.gateway_ip, external_ips: value.external_ips.into(), + attached_subnets: value.attached_subnets.into(), + transit_ips: value.transit_ips.into(), } } } @@ -218,6 +247,8 @@ impl From for Ipv6Cfg { private_ip: value.private_ip, gateway_ip: value.gateway_ip, external_ips: value.external_ips.into(), + attached_subnets: value.attached_subnets.into(), + transit_ips: value.transit_ips.into(), } } } diff --git a/lib/oxide-vpc/src/engine/firewall.rs b/lib/oxide-vpc/src/engine/firewall.rs index ad0bbb05..78e16266 100644 --- a/lib/oxide-vpc/src/engine/firewall.rs +++ b/lib/oxide-vpc/src/engine/firewall.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The Oxide VPC firewall. //! 
@@ -18,9 +18,8 @@ use crate::api::Ports; pub use crate::api::ProtoFilter; use crate::api::RemFwRuleReq; use crate::api::SetFwRulesReq; -use crate::engine::overlay::ACTION_META_VNI; +use crate::engine::overlay::VniTag; use alloc::collections::BTreeSet; -use alloc::string::ToString; use alloc::vec::Vec; use core::num::NonZeroU32; use opte::api::Direction; @@ -236,10 +235,9 @@ impl Address { Predicate::InnerSrcIp6(vec![Ipv6AddrMatch::Prefix(ip6_sub)]), ), - (_, Address::Vni(vni)) => Some(Predicate::Meta( - ACTION_META_VNI.to_string(), - vni.to_string(), - )), + (_, Address::Vni(vni)) => { + Some(Predicate::from_action_meta(VniTag(vni))) + } } } } diff --git a/lib/oxide-vpc/src/engine/gateway/arp.rs b/lib/oxide-vpc/src/engine/gateway/arp.rs index d530ce16..808af5b8 100644 --- a/lib/oxide-vpc/src/engine/gateway/arp.rs +++ b/lib/oxide-vpc/src/engine/gateway/arp.rs @@ -2,23 +2,21 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2023 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The ARP implementation of the Virtual Gateway. 
-use crate::cfg::VpcCfg; -use opte::api::Direction; +use super::BuildCtx; use opte::api::MacAddr; use opte::api::OpteError; use opte::engine::ether::ETHER_TYPE_ARP; -use opte::engine::layer::Layer; use opte::engine::predicate::EtherAddrMatch; use opte::engine::predicate::EtherTypeMatch; use opte::engine::predicate::Predicate; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub fn setup(layer: &mut Layer, cfg: &VpcCfg) -> Result<(), OpteError> { +pub(crate) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { // ================================================================ // Outbound ARP Request for Gateway, from Guest // @@ -31,9 +29,11 @@ pub fn setup(layer: &mut Layer, cfg: &VpcCfg) -> Result<(), OpteError> { Predicate::InnerEtherDst(vec![EtherAddrMatch::Exact( MacAddr::BROADCAST, )]), - Predicate::InnerEtherSrc(vec![EtherAddrMatch::Exact(cfg.guest_mac)]), + Predicate::InnerEtherSrc(vec![EtherAddrMatch::Exact( + ctx.cfg.guest_mac, + )]), ]); - layer.add_rule(Direction::Out, rule.finalize()); + ctx.out_rules.push(rule.finalize()); Ok(()) } diff --git a/lib/oxide-vpc/src/engine/gateway/dhcp.rs b/lib/oxide-vpc/src/engine/gateway/dhcp.rs index d10698e6..6f3f82f4 100644 --- a/lib/oxide-vpc/src/engine/gateway/dhcp.rs +++ b/lib/oxide-vpc/src/engine/gateway/dhcp.rs @@ -2,31 +2,26 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The DHCP implementation of the Virtual Gateway. 
+use super::BuildCtx; use crate::cfg::Ipv4Cfg; -use crate::cfg::VpcCfg; use alloc::sync::Arc; -use opte::api::DhcpCfg; use opte::api::DhcpReplyType; -use opte::api::Direction; use opte::api::Ipv4Addr; use opte::api::Ipv4PrefixLen; use opte::api::OpteError; use opte::api::SubnetRouterPair; use opte::engine::dhcp::DhcpAction; use opte::engine::ip::v4::Ipv4Cidr; -use opte::engine::layer::Layer; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub fn setup( - layer: &mut Layer, - cfg: &VpcCfg, +pub(crate) fn setup( + ctx: &mut BuildCtx, ip_cfg: &Ipv4Cfg, - dhcp_cfg: DhcpCfg, ) -> Result<(), OpteError> { // All guest interfaces live on a `/32`-network in the Oxide VPC; // restricting the L2 domain to two nodes: the guest NIC and the @@ -65,35 +60,36 @@ pub fn setup( ); let offer = Action::Hairpin(Arc::new(DhcpAction { - client_mac: cfg.guest_mac, + client_mac: ctx.cfg.guest_mac, client_ip: ip_cfg.private_ip, subnet_prefix_len: Ipv4PrefixLen::NETMASK_ALL, - gw_mac: cfg.gateway_mac, + gw_mac: ctx.cfg.gateway_mac, gw_ip: ip_cfg.gateway_ip, reply_type: DhcpReplyType::Offer, re1, re2: Some(re2), re3: None, - dhcp_cfg: dhcp_cfg.clone(), + dhcp_cfg: ctx.cfg.dhcp.clone(), })); let ack = Action::Hairpin(Arc::new(DhcpAction { - client_mac: cfg.guest_mac, + client_mac: ctx.cfg.guest_mac, client_ip: ip_cfg.private_ip, subnet_prefix_len: Ipv4PrefixLen::NETMASK_ALL, - gw_mac: cfg.gateway_mac, + gw_mac: ctx.cfg.gateway_mac, gw_ip: ip_cfg.gateway_ip, reply_type: DhcpReplyType::Ack, re1, re2: Some(re2), re3: None, - dhcp_cfg, + dhcp_cfg: ctx.cfg.dhcp.clone(), })); let discover_rule = Rule::new(1, offer); - layer.add_rule(Direction::Out, discover_rule.finalize()); - let request_rule = Rule::new(1, ack); - layer.add_rule(Direction::Out, request_rule.finalize()); + + ctx.out_rules.push(discover_rule.finalize()); + ctx.out_rules.push(request_rule.finalize()); + Ok(()) } diff --git a/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs b/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs index 
00bbec2a..1fbaebdd 100644 --- a/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs +++ b/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs @@ -2,28 +2,21 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2023 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The DHCPv6 implementation of the Virtual Gateway. -use crate::cfg::VpcCfg; +use super::BuildCtx; use alloc::sync::Arc; -use opte::api::DhcpCfg; -use opte::api::Direction; use opte::api::OpteError; use opte::engine::dhcpv6::AddressInfo; use opte::engine::dhcpv6::Dhcpv6Action; use opte::engine::dhcpv6::LeasedAddress; -use opte::engine::layer::Layer; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub fn setup( - layer: &mut Layer, - cfg: &VpcCfg, - dhcp_cfg: DhcpCfg, -) -> Result<(), OpteError> { - let ip_cfg = match cfg.ipv6_cfg() { +pub(crate) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { + let ip_cfg = match ctx.cfg.ipv6_cfg() { None => return Ok(()), Some(ip_cfg) => ip_cfg, }; @@ -35,15 +28,14 @@ pub fn setup( renew: u32::MAX, }; let action = Dhcpv6Action { - client_mac: cfg.guest_mac, - server_mac: cfg.gateway_mac, + client_mac: ctx.cfg.guest_mac, + server_mac: ctx.cfg.gateway_mac, addrs, sntp_servers: vec![], - dhcp_cfg, + dhcp_cfg: ctx.cfg.dhcp.clone(), }; let server = Action::Hairpin(Arc::new(action)); - let rule = Rule::new(1, server); - layer.add_rule(Direction::Out, rule.finalize()); + ctx.out_rules.push(Rule::new(1, server).finalize()); Ok(()) } diff --git a/lib/oxide-vpc/src/engine/gateway/icmp.rs b/lib/oxide-vpc/src/engine/gateway/icmp.rs index c4c48550..f7094891 100644 --- a/lib/oxide-vpc/src/engine/gateway/icmp.rs +++ b/lib/oxide-vpc/src/engine/gateway/icmp.rs @@ -2,23 +2,20 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2023 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The ICMP implementation of the Virtual Gateway. +use super::BuildCtx; use crate::cfg::Ipv4Cfg; -use crate::cfg::VpcCfg; use alloc::sync::Arc; -use opte::api::Direction; use opte::api::OpteError; use opte::engine::icmp::v4::IcmpEchoReply; -use opte::engine::layer::Layer; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub fn setup( - layer: &mut Layer, - cfg: &VpcCfg, +pub(crate) fn setup( + ctx: &mut BuildCtx, ip_cfg: &Ipv4Cfg, ) -> Result<(), OpteError> { // ================================================================ @@ -27,12 +24,12 @@ pub fn setup( let reply = Action::Hairpin(Arc::new(IcmpEchoReply { // Map an Echo from guest (src) -> gateway (dst) to an Echo // Reply from gateway (dst) -> guest (src). - echo_src_mac: cfg.guest_mac, + echo_src_mac: ctx.cfg.guest_mac, echo_src_ip: ip_cfg.private_ip, - echo_dst_mac: cfg.gateway_mac, + echo_dst_mac: ctx.cfg.gateway_mac, echo_dst_ip: ip_cfg.gateway_ip, })); let rule = Rule::new(1, reply); - layer.add_rule(Direction::Out, rule.finalize()); + ctx.out_rules.push(rule.finalize()); Ok(()) } diff --git a/lib/oxide-vpc/src/engine/gateway/icmpv6.rs b/lib/oxide-vpc/src/engine/gateway/icmpv6.rs index 0009acb1..6e27cbac 100644 --- a/lib/oxide-vpc/src/engine/gateway/icmpv6.rs +++ b/lib/oxide-vpc/src/engine/gateway/icmpv6.rs @@ -2,20 +2,18 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The ICMPv6 implementation of the Virtual Gateway. 
+use super::BuildCtx; use crate::cfg::Ipv6Cfg; -use crate::cfg::VpcCfg; use alloc::sync::Arc; -use opte::api::Direction; use opte::api::Ipv6Addr; use opte::api::OpteError; use opte::engine::icmp::v6::Icmpv6EchoReply; use opte::engine::icmp::v6::NeighborAdvertisement; use opte::engine::icmp::v6::RouterAdvertisement; -use opte::engine::layer::Layer; use opte::engine::predicate::Predicate; use opte::engine::rule::Action; use opte::engine::rule::Rule; @@ -34,34 +32,33 @@ use smoltcp::wire::Icmpv6Message; // - Respond to NDP Neighbor Solicitations from the guest to the gateway. This // includes solicitations unicast to the gateway, and also delivered to the // solicited-node multicast group. -pub fn setup( - layer: &mut Layer, - cfg: &VpcCfg, +pub(crate) fn setup( + ctx: &mut BuildCtx, ip_cfg: &Ipv6Cfg, ) -> Result<(), OpteError> { - let dst_ip = Ipv6Addr::from_eui64(&cfg.gateway_mac); + let dst_ip = Ipv6Addr::from_eui64(&ctx.cfg.gateway_mac); let hairpins = [ // We need to hairpin echo requests from either the VPC-private or // link-local address of the guest, to OPTE's link-local. Action::Hairpin(Arc::new(Icmpv6EchoReply { - src_mac: cfg.guest_mac, + src_mac: ctx.cfg.guest_mac, src_ip: ip_cfg.private_ip, - dst_mac: cfg.gateway_mac, + dst_mac: ctx.cfg.gateway_mac, dst_ip, })), Action::Hairpin(Arc::new(Icmpv6EchoReply { - src_mac: cfg.guest_mac, - src_ip: Ipv6Addr::from_eui64(&cfg.guest_mac), - dst_mac: cfg.gateway_mac, + src_mac: ctx.cfg.guest_mac, + src_ip: Ipv6Addr::from_eui64(&ctx.cfg.guest_mac), + dst_mac: ctx.cfg.gateway_mac, dst_ip, })), // Map an NDP Router Solicitation from the guest to a Router Advertisement // from the OPTE virtual gateway's link-local IPv6 address. Action::Hairpin(Arc::new(RouterAdvertisement::new( // From the guest's VPC MAC. - cfg.guest_mac, + ctx.cfg.guest_mac, // The MAC from which we respond, i.e., OPTE's MAC. 
- cfg.gateway_mac, + ctx.cfg.gateway_mac, // "Managed Configuration", indicating the guest needs to use DHCPv6 to // acquire an IPv6 address. true, @@ -71,9 +68,9 @@ pub fn setup( // per RFC 4861 so that the guest does not mark the neighbor failed. Action::Hairpin(Arc::new(NeighborAdvertisement::new( // From the guest's VPC MAC. - cfg.guest_mac, + ctx.cfg.guest_mac, // To OPTE's MAC. - cfg.gateway_mac, + ctx.cfg.gateway_mac, // Set the ROUTER flag to true. true, // Respond to solicitations from `::` @@ -84,11 +81,12 @@ pub fn setup( // UNWRAP SAFETY: There are far fewer than 65535 rules inserted here. let next_out_prio = u16::try_from(hairpins.len() + 1).unwrap(); // Add rules for the above actions. - hairpins.into_iter().enumerate().for_each(|(i, action)| { - let priority = u16::try_from(i + 1).unwrap(); - let rule = Rule::new(priority, action); - layer.add_rule(Direction::Out, rule.finalize()); - }); + ctx.out_rules.extend(hairpins.into_iter().enumerate().map( + |(i, action)| { + let priority = u16::try_from(i + 1).unwrap(); + Rule::new(priority, action).finalize() + }, + )); // Filter any uncaught in/out-bound NDP traffic. let pred = Predicate::Icmpv6MsgType(vec![ @@ -99,11 +97,11 @@ pub fn setup( let mut ndp_filter = Rule::new(next_out_prio, Action::Deny); ndp_filter.add_predicate(pred); - layer.add_rule(Direction::Out, ndp_filter.finalize()); + ctx.out_rules.push(ndp_filter.finalize()); let mut ndp_filter = Rule::new(1, Action::Deny); ndp_filter.add_predicate(in_pred); - layer.add_rule(Direction::In, ndp_filter.finalize()); + ctx.in_rules.push(ndp_filter.finalize()); Ok(()) } diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index b3ad7d4a..6aac9a6d 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The Oxide VPC Virtual Gateway. //! @@ -55,19 +55,22 @@ //! allow multicast packets to reach guests and rewrite the source MAC //! to the gateway MAC, similar to unicast traffic. -use crate::api::DhcpCfg; +use crate::api::AttachedSubnetConfig; use crate::api::MacAddr; +use crate::api::TransitIpConfig; use crate::cfg::Ipv4Cfg; use crate::cfg::Ipv6Cfg; use crate::cfg::VpcCfg; -use crate::engine::overlay::ACTION_META_VNI; +use crate::engine::overlay::VniTag; use crate::engine::overlay::VpcMappings; -use alloc::string::ToString; +use alloc::collections::BTreeMap; +use alloc::collections::BTreeSet; use alloc::sync::Arc; use alloc::vec::Vec; use core::fmt; use core::fmt::Display; use opte::api::Direction; +use opte::api::NoResp; use opte::api::OpteError; use opte::engine::ether::EtherMod; use opte::engine::headers::HeaderAction; @@ -78,6 +81,7 @@ use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; use opte::engine::packet::InnerFlowId; use opte::engine::packet::MblkPacketData; +use opte::engine::port::Port; use opte::engine::port::PortBuilder; use opte::engine::port::Pos; use opte::engine::port::meta::ActionMeta; @@ -88,6 +92,7 @@ use opte::engine::predicate::Ipv6AddrMatch; use opte::engine::predicate::Predicate; use opte::engine::rule::Action; use opte::engine::rule::AllowOrDeny; +use opte::engine::rule::Finalized; use opte::engine::rule::GenHtResult; use opte::engine::rule::HdrTransform; use opte::engine::rule::MetaAction; @@ -103,14 +108,22 @@ pub mod icmpv6; mod transit; pub use transit::*; +use super::VpcNetwork; + pub const NAME: &str = "gateway"; +pub(crate) struct BuildCtx<'a> { + in_rules: Vec>, + out_rules: Vec>, + cfg: &'a VpcCfg, + vpc_meta: Arc, +} + pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, vpc_mappings: Arc, ft_limit: core::num::NonZeroU32, - dhcp_cfg: &DhcpCfg, ) -> Result<(), OpteError> { // We implement the gateway as a filtering layer in order 
to // enforce that any traffic that makes it past this layer is @@ -128,23 +141,50 @@ pub fn setup( let mut layer = Layer::new(NAME, pb.name(), actions, ft_limit); + let mut ctx = BuildCtx { + in_rules: vec![], + out_rules: vec![], + cfg, + vpc_meta: Arc::new(VpcMeta::new(vpc_mappings)), + }; + if let Some(ipv4_cfg) = cfg.ipv4_cfg() { - setup_ipv4( - &mut layer, - cfg, - ipv4_cfg, - vpc_mappings.clone(), - dhcp_cfg.clone(), - )?; + setup_ipv4(&mut ctx, ipv4_cfg)?; } if let Some(ipv6_cfg) = cfg.ipv6_cfg() { - setup_ipv6(&mut layer, cfg, ipv6_cfg, vpc_mappings, dhcp_cfg.clone())?; + setup_ipv6(&mut ctx, ipv6_cfg)?; } + layer.set_rules(ctx.in_rules, ctx.out_rules); + pb.add_layer(layer, Pos::Before("firewall")) } +// Recreates the full set of gateway rules on a given port in response to a +// change to the set of transit IPs or overall `IpCfg`. +pub fn set_gateway_rules( + port: &Port, + vpc_mappings: Arc, +) -> Result { + let mut ctx = BuildCtx { + in_rules: vec![], + out_rules: vec![], + cfg: &port.network().cfg, + vpc_meta: Arc::new(VpcMeta::new(vpc_mappings)), + }; + + if let Some(ipv4_cfg) = ctx.cfg.ipv4_cfg() { + setup_ipv4(&mut ctx, ipv4_cfg)?; + } + + if let Some(ipv6_cfg) = ctx.cfg.ipv6_cfg() { + setup_ipv6(&mut ctx, ipv6_cfg)?; + } + + port.set_rules(NAME, ctx.in_rules, ctx.out_rules).map(|_| NoResp::default()) +} + struct RewriteSrcMac { gateway_mac: MacAddr, } @@ -177,18 +217,31 @@ impl StaticAction for RewriteSrcMac { } } -fn setup_ipv4( - layer: &mut Layer, - cfg: &VpcCfg, - ip_cfg: &Ipv4Cfg, - vpc_mappings: Arc, - dhcp_cfg: DhcpCfg, -) -> Result<(), OpteError> { - arp::setup(layer, cfg)?; - dhcp::setup(layer, cfg, ip_cfg, dhcp_cfg)?; - icmp::setup(layer, cfg, ip_cfg)?; +struct Exceptions<'a, T> { + allow_in: BTreeSet<&'a T>, + allow_out: BTreeSet<&'a T>, +} + +fn compute_exceptions<'a, T: Ord>( + attached: &'a BTreeMap, + transit: &'a BTreeMap, +) -> Exceptions<'a, T> { + let allow_in: BTreeSet<_> = attached + .keys() + 
.chain(transit.iter().filter_map(|(k, v)| v.allow_in.then_some(k))) + .collect(); + let allow_out: BTreeSet<_> = attached + .keys() + .chain(transit.iter().filter_map(|(k, v)| v.allow_out.then_some(k))) + .collect(); + + Exceptions { allow_in, allow_out } +} - let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); +fn setup_ipv4(ctx: &mut BuildCtx, ip_cfg: &Ipv4Cfg) -> Result<(), OpteError> { + arp::setup(ctx)?; + dhcp::setup(ctx, ip_cfg)?; + icmp::setup(ctx, ip_cfg)?; // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. // This rule has no destination IP predicate, so it matches both unicast @@ -200,28 +253,28 @@ fn setup_ipv4( // unless the group is configured. In the future, we may want to explicitly // filter outbound multicast to only the groups configured via M2P to further // tighten spoof prevention at the gateway layer. - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(ctx.vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), ])); nospoof_out.add_predicate(Predicate::InnerEtherSrc(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), + EtherAddrMatch::Exact(ctx.cfg.guest_mac), ])); - layer.add_rule(Direction::Out, nospoof_out.finalize()); + ctx.out_rules.push(nospoof_out.finalize()); let mut unicast_in = Rule::new( 1000, Action::Static(Arc::new(RewriteSrcMac { - gateway_mac: cfg.gateway_mac, + gateway_mac: ctx.cfg.gateway_mac, })), ); unicast_in.add_predicate(Predicate::InnerDstIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), ])); unicast_in.add_predicate(Predicate::InnerEtherDst(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), + EtherAddrMatch::Exact(ctx.cfg.guest_mac), ])); - layer.add_rule(Direction::In, unicast_in.finalize()); + ctx.in_rules.push(unicast_in.finalize()); // Inbound IPv4 multicast - rewrite source MAC to gateway and allow let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; @@ -230,28 
+283,43 @@ fn setup_ipv4( let mut mcast_in_v4 = Rule::new( 1001, Action::Static(Arc::new(RewriteSrcMac { - gateway_mac: cfg.gateway_mac, + gateway_mac: ctx.cfg.gateway_mac, })), ); mcast_in_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); mcast_in_v4.add_predicate(Predicate::InnerEtherDst(vec![ EtherAddrMatch::Multicast, ])); - layer.add_rule(Direction::In, mcast_in_v4.finalize()); + ctx.in_rules.push(mcast_in_v4.finalize()); + + // Plumb in any required exceptions to spoof prevention/filtering. + let transit = ip_cfg.transit_ips.load(); + let attached = ip_cfg.attached_subnets.load(); + + let Exceptions { allow_in, allow_out } = + compute_exceptions(&attached, &transit); + + for (place, dir, from) in [ + (&mut ctx.in_rules, Direction::In, allow_in), + (&mut ctx.out_rules, Direction::Out, allow_out), + ] { + place.extend(from.into_iter().map(|cidr| { + make_holepunch_rule( + ctx.cfg.guest_mac, + ctx.cfg.gateway_mac, + (*cidr).into(), + dir, + &ctx.vpc_meta, + ) + })); + } Ok(()) } -fn setup_ipv6( - layer: &mut Layer, - cfg: &VpcCfg, - ip_cfg: &Ipv6Cfg, - vpc_mappings: Arc, - dhcp_cfg: DhcpCfg, -) -> Result<(), OpteError> { - icmpv6::setup(layer, cfg, ip_cfg)?; - dhcpv6::setup(layer, cfg, dhcp_cfg)?; - let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); +fn setup_ipv6(ctx: &mut BuildCtx, ip_cfg: &Ipv6Cfg) -> Result<(), OpteError> { + icmpv6::setup(ctx, ip_cfg)?; + dhcpv6::setup(ctx)?; // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. // This rule has no destination IP predicate, so it matches both unicast @@ -263,42 +331,64 @@ fn setup_ipv6( // unless the group is configured. In the future, we may want to explicitly // filter outbound multicast to only the groups configured via M2P to further // tighten spoof prevention at the gateway layer. 
- let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(ctx.vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), ])); nospoof_out.add_predicate(Predicate::InnerEtherSrc(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), + EtherAddrMatch::Exact(ctx.cfg.guest_mac), ])); - layer.add_rule(Direction::Out, nospoof_out.finalize()); + ctx.out_rules.push(nospoof_out.finalize()); let mut unicast_in = Rule::new( 1000, Action::Static(Arc::new(RewriteSrcMac { - gateway_mac: cfg.gateway_mac, + gateway_mac: ctx.cfg.gateway_mac, })), ); unicast_in.add_predicate(Predicate::InnerDstIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), ])); unicast_in.add_predicate(Predicate::InnerEtherDst(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), + EtherAddrMatch::Exact(ctx.cfg.guest_mac), ])); - layer.add_rule(Direction::In, unicast_in.finalize()); + ctx.in_rules.push(unicast_in.finalize()); // Inbound IPv6 multicast - rewrite source MAC to gateway and allow let ipv6_mcast = vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]; let mut mcast_in = Rule::new( 1001, Action::Static(Arc::new(RewriteSrcMac { - gateway_mac: cfg.gateway_mac, + gateway_mac: ctx.cfg.gateway_mac, })), ); mcast_in.add_predicate(Predicate::InnerDstIp6(ipv6_mcast)); mcast_in.add_predicate(Predicate::InnerEtherDst(vec![ EtherAddrMatch::Multicast, ])); - layer.add_rule(Direction::In, mcast_in.finalize()); + ctx.in_rules.push(mcast_in.finalize()); + + // Plumb in any required exceptions to spoof prevention/filtering. 
+ let transit = ip_cfg.transit_ips.load(); + let attached = ip_cfg.attached_subnets.load(); + + let Exceptions { allow_in, allow_out } = + compute_exceptions(&attached, &transit); + + for (place, dir, from) in [ + (&mut ctx.in_rules, Direction::In, allow_in), + (&mut ctx.out_rules, Direction::Out, allow_out), + ] { + place.extend(from.into_iter().map(|cidr| { + make_holepunch_rule( + ctx.cfg.guest_mac, + ctx.cfg.gateway_mac, + (*cidr).into(), + dir, + &ctx.vpc_meta, + ) + })); + } Ok(()) } @@ -307,7 +397,7 @@ fn setup_ipv6( /// /// This allows the outbound side of firewall layer to filter based on /// VPC. -struct VpcMeta { +pub(crate) struct VpcMeta { vpc_mappings: Arc, } @@ -325,8 +415,7 @@ impl MetaAction for VpcMeta { ) -> ModMetaResult { match self.vpc_mappings.ip_to_vni(&flow.dst_ip()) { Some(vni) => { - action_meta - .insert(ACTION_META_VNI.into(), vni.to_string().into()); + action_meta.insert_typed(&VniTag(vni)); Ok(AllowOrDeny::Allow(())) } diff --git a/lib/oxide-vpc/src/engine/gateway/transit.rs b/lib/oxide-vpc/src/engine/gateway/transit.rs index 9d58d3a1..b8f7011a 100644 --- a/lib/oxide-vpc/src/engine/gateway/transit.rs +++ b/lib/oxide-vpc/src/engine/gateway/transit.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Utility functions to allow a port to permit traffic on an //! additional set of CIDR blocks, e.g. 
to enable transit for @@ -10,18 +10,20 @@ use super::*; use crate::api::RemoveCidrResp; +use crate::cfg::IpCfg; use crate::engine::VpcNetwork; +use alloc::collections::btree_map::Entry; use opte::api::IpCidr; use opte::api::NoResp; use opte::engine::port::Port; use opte::engine::rule::Finalized; -fn make_holepunch_rule( +pub(crate) fn make_holepunch_rule( guest_mac: MacAddr, gateway_mac: MacAddr, dest: IpCidr, dir: Direction, - vpc_mappings: Arc, + vpc_meta: &Arc, ) -> Rule { let (cidr_in_pred, cidr_out_pred) = match dest { IpCidr::Ip4(v4) => ( @@ -48,7 +50,7 @@ fn make_holepunch_rule( cidr_in.finalize() } Direction::Out => { - let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); + let vpc_meta = vpc_meta.clone(); let mut cidr_out = Rule::new(1000, Action::Meta(vpc_meta)); cidr_out.add_predicate(Predicate::InnerEtherSrc(vec![ EtherAddrMatch::Exact(guest_mac), @@ -68,15 +70,7 @@ pub fn allow_cidr( dir: Direction, vpc_mappings: Arc, ) -> Result { - let rule = make_holepunch_rule( - port.mac_addr(), - port.network().cfg.gateway_mac, - dest, - dir, - vpc_mappings, - ); - port.add_rule(NAME, dir, rule)?; - Ok(NoResp::default()) + modify_cidr(port, dest, dir, vpc_mappings, true).map(|_| NoResp::default()) } /// Prevents a guest from sending/receiving traffic on a CIDR block @@ -87,22 +81,70 @@ pub fn remove_cidr( dir: Direction, vpc_mappings: Arc, ) -> Result { - let rule = make_holepunch_rule( - port.mac_addr(), - port.network().cfg.gateway_mac, - dest, - dir, - vpc_mappings, - ); + modify_cidr(port, dest, dir, vpc_mappings, false).map(|changed| { + if changed { + RemoveCidrResp::Ok(dest) + } else { + RemoveCidrResp::NotFound + } + }) +} - let maybe_id = port.find_rule(NAME, dir, &rule)?; - if let Some(id) = maybe_id { - port.remove_rule(NAME, dir, id)?; +fn modify_cidr( + port: &Port, + dest: IpCidr, + dir: Direction, + vpc_mappings: Arc, + allow: bool, +) -> Result { + let mut existing = false; + let mut remove = false; + + match (&port.network().cfg.ip_cfg, dest) { + 
(IpCfg::Ipv4(ipv4), IpCidr::Ip4(ipv4_cidr)) + | (IpCfg::DualStack { ipv4, .. }, IpCidr::Ip4(ipv4_cidr)) => { + ipv4.transit_ips.update(|v| { + let mut new = v.clone(); + let el = new.entry(ipv4_cidr); + existing = matches!(el, Entry::Occupied(_)); + if allow || existing { + let el = el.or_default(); + match dir { + Direction::In => el.allow_in = allow, + Direction::Out => el.allow_out = allow, + } + remove = !allow && !el.allow_in && !el.allow_out; + } + if remove { + new.remove(&ipv4_cidr); + } + Some(new) + }); + } + (IpCfg::Ipv6(ipv6), IpCidr::Ip6(ipv6_cidr)) + | (IpCfg::DualStack { ipv6, .. }, IpCidr::Ip6(ipv6_cidr)) => { + ipv6.transit_ips.update(|v| { + let mut new = v.clone(); + let el = new.entry(ipv6_cidr); + existing = matches!(el, Entry::Occupied(_)); + if allow || existing { + let el = el.or_default(); + match dir { + Direction::In => el.allow_in = allow, + Direction::Out => el.allow_out = allow, + } + remove = !allow && !el.allow_in && !el.allow_out; + } + if remove { + new.remove(&ipv6_cidr); + } + Some(new) + }); + } + _ => return Err(OpteError::InvalidIpCfg), } - Ok(if maybe_id.is_none() { - RemoveCidrResp::NotFound - } else { - RemoveCidrResp::Ok(dest) - }) + super::set_gateway_rules(port, vpc_mappings)?; + + Ok(existing) } diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index f26a2fd1..19ed2b01 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Geneve option types specific to the Oxide VPC dataplane. //! 
@@ -532,7 +532,7 @@ mod test { // Build a minimal packet with just one Multicast option #[rustfmt::skip] - let buf = vec![ + let buf = [ // UDP source 0x1E, 0x61, // UDP dest diff --git a/lib/oxide-vpc/src/engine/nat.rs b/lib/oxide-vpc/src/engine/nat.rs index 9f39d66a..473b63ba 100644 --- a/lib/oxide-vpc/src/engine/nat.rs +++ b/lib/oxide-vpc/src/engine/nat.rs @@ -2,13 +2,19 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2023 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use super::VpcNetwork; +use super::gateway; +use super::overlay::VpcMappings; use super::router::ROUTER_LAYER_NAME; use super::router::RouterTargetClass; use super::router::RouterTargetInternal; +use crate::api::AttachSubnetReq; +use crate::api::DetachSubnetReq; +use crate::api::DetachSubnetResp; use crate::api::ExternalIpCfg; +use crate::api::InternetGatewayMap; use crate::api::SetExternalIpsReq; use crate::cfg::IpCfg; use crate::cfg::Ipv4Cfg; @@ -21,6 +27,7 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::num::NonZeroU32; use opte::api::IpAddr; +use opte::api::IpCidr; use opte::api::Ipv4Addr; use opte::api::Ipv6Addr; use opte::api::OpteError; @@ -30,6 +37,7 @@ use opte::engine::ether::ETHER_TYPE_IPV6; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; +use opte::engine::nat::ExternalIpTagger; use opte::engine::nat::InboundNat; use opte::engine::nat::OutboundNat; use opte::engine::nat::VerifyAddr; @@ -49,6 +57,7 @@ use opte::engine::snat::SNat; use uuid::Uuid; pub const NAT_LAYER_NAME: &str = "nat"; +const EXTERNAL_ATTACHED_SUBNET_PRIORITY: u16 = 4; const FLOATING_ONE_TO_ONE_NAT_PRIORITY: u16 = 5; const EPHEMERAL_ONE_TO_ONE_NAT_PRIORITY: u16 = 10; const SNAT_PRIORITY: u16 = 100; @@ -115,25 +124,15 @@ pub fn setup( #[allow(clippy::type_complexity)] fn create_nat_rules( cfg: &VpcCfg, - inet_gw_map: Option>>, + 
inet_gw_map: Option<&InternetGatewayMap>, ) -> Result<(Vec>, Vec>), OpteError> { let mut in_rules = vec![]; let mut out_rules = vec![]; if let Some(ipv4_cfg) = cfg.ipv4_cfg() { - setup_ipv4_nat( - ipv4_cfg, - &mut in_rules, - &mut out_rules, - inet_gw_map.as_ref(), - )?; + setup_ipv4_nat(ipv4_cfg, &mut in_rules, &mut out_rules, inet_gw_map)?; } if let Some(ipv6_cfg) = cfg.ipv6_cfg() { - setup_ipv6_nat( - ipv6_cfg, - &mut in_rules, - &mut out_rules, - inet_gw_map.as_ref(), - )?; + setup_ipv6_nat(ipv6_cfg, &mut in_rules, &mut out_rules, inet_gw_map)?; } // Append an additional rule to drop any InternetGateway packets @@ -142,9 +141,8 @@ fn create_nat_rules( // internet gateways but have no valid source address on a selected // IGW. let mut out_igw_nat_miss = Rule::new(NO_EIP_PRIORITY, Action::Deny); - out_igw_nat_miss.add_predicate(Predicate::Meta( - RouterTargetClass::KEY.to_string(), - RouterTargetClass::InternetGateway.as_meta().into_owned(), + out_igw_nat_miss.add_predicate(Predicate::from_action_meta( + RouterTargetClass::InternetGateway, )); out_rules.push(out_igw_nat_miss.finalize()); @@ -166,6 +164,37 @@ fn setup_ipv4_nat( let in_nat = Arc::new(InboundNat::new(ip_cfg.private_ip, verifier.clone())); let external_cfg = ip_cfg.external_ips.load(); + let attached_subnets: Vec<_> = ip_cfg + .attached_subnets + .load() + .iter() + .filter_map(|(k, v)| v.is_external.then_some(Ipv4AddrMatch::Prefix(*k))) + .collect(); + + if !attached_subnets.is_empty() { + // Use of this rule implicitly requires that we have selected *an* + // InternetGateway routing target by the time we reach the overlay layer. + // Don't match on the RouterTargetClass as a predicate here, as we need + // to record that a known EIP was used as a source. 
+ let mut out_subnet = Rule::new( + EXTERNAL_ATTACHED_SUBNET_PRIORITY, + Action::Meta(Arc::new(ExternalIpTagger)), + ); + out_subnet + .add_predicate(Predicate::InnerSrcIp4(attached_subnets.clone())); + out_rules.push(out_subnet.finalize()); + + // Inbound rules here aren't *strictly* necessary, as the control plane + // should not be assigning us EIPs which overlap with these subnets. + // We would then fall through to the default `Allow`. + // + // Install these as belts and braces, regardless. + let mut in_subnet = + Rule::new(EXTERNAL_ATTACHED_SUBNET_PRIORITY, Action::Allow); + in_subnet.add_predicate(Predicate::InnerDstIp4(attached_subnets)); + in_rules.push(in_subnet.finalize()); + } + // Outbound IP selection needs to be gated upon which internet gateway was // chosen during routing. // We need to partition FIPs into separate lists based on which internet gateway @@ -324,6 +353,37 @@ fn setup_ipv6_nat( let in_nat = Arc::new(InboundNat::new(ip_cfg.private_ip, verifier.clone())); let external_cfg = ip_cfg.external_ips.load(); + let attached_subnets: Vec<_> = ip_cfg + .attached_subnets + .load() + .iter() + .filter_map(|(k, v)| v.is_external.then_some(Ipv6AddrMatch::Prefix(*k))) + .collect(); + + if !attached_subnets.is_empty() { + // Use of this rule implicitly requires that we have selected *an* + // InternetGateway routing target by the time we reach the overlay layer. + // Don't match on the RouterTargetClass as a predicate here, as we need + // to record that a known EIP was used as a source. + let mut out_subnet = Rule::new( + EXTERNAL_ATTACHED_SUBNET_PRIORITY, + Action::Meta(Arc::new(ExternalIpTagger)), + ); + out_subnet + .add_predicate(Predicate::InnerSrcIp6(attached_subnets.clone())); + out_rules.push(out_subnet.finalize()); + + // Inbound rules here aren't *strictly* necessary, as the control plane + // should not be assigning us EIPs which overlap with these subnets. + // We would then fall through to the default `Allow`. 
+ // + // Install these as belts and braces, regardless. + let mut in_subnet = + Rule::new(EXTERNAL_ATTACHED_SUBNET_PRIORITY, Action::Allow); + in_subnet.add_predicate(Predicate::InnerDstIp6(attached_subnets)); + in_rules.push(in_subnet.finalize()); + } + // See `setup_ipv4_nat` for an explanation on partitioning FIPs // by internet gateway ID. if !external_cfg.floating_ips.is_empty() { @@ -465,11 +525,11 @@ fn setup_ipv6_nat( Ok(()) } -pub fn set_nat_rules( - cfg: &VpcCfg, +pub fn set_external_ips( port: &Port, req: SetExternalIpsReq, ) -> Result<(), OpteError> { + let cfg = &port.network().cfg; // This procedure only holds one lock at a time: a `Dynamic`'s shared // space writelock, *or* the table lock via set_rules_soft. // The datapath will hold the table lock for processing, *and* the `Dynamic`'s @@ -497,6 +557,108 @@ pub fn set_nat_rules( _ => return Err(OpteError::InvalidIpCfg), } - let (in_rules, out_rules) = create_nat_rules(cfg, req.inet_gw_map)?; + refresh_nat_rules(port, req.inet_gw_map.as_ref()) +} + +pub fn attach_subnet( + port: &Port, + inet_gw_map: Option<&InternetGatewayMap>, + vpc_mappings: &Arc, + req: AttachSubnetReq, +) -> Result<(), OpteError> { + let cfg = &port.network().cfg; + let changed = match (req.cidr, &cfg.ip_cfg) { + (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) + | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { + v4_cfg.attached_subnets.update(|map| { + let install = if let Some(val) = map.get(&v4) { + val != &req.cfg + } else { + true + }; + install.then(|| { + let mut out = map.clone(); + out.insert(v4, req.cfg); + out + }) + }) + } + (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) + | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. 
}) => { + v6_cfg.attached_subnets.update(|map| { + let install = if let Some(val) = map.get(&v6) { + val != &req.cfg + } else { + true + }; + install.then(|| { + let mut out = map.clone(); + out.insert(v6, req.cfg); + out + }) + }) + } + // Trying to attach a CIDR class which this port cannot use. + _ => return Err(OpteError::InvalidIpCfg), + }; + + if changed { + refresh_nat_rules(port, inet_gw_map)?; + gateway::set_gateway_rules(port, vpc_mappings.clone())?; + } + + Ok(()) +} + +pub fn detach_subnet( + port: &Port, + inet_gw_map: Option<&InternetGatewayMap>, + vpc_mappings: &Arc, + req: DetachSubnetReq, +) -> Result { + let cfg = &port.network().cfg; + let changed = match (req.cidr, &cfg.ip_cfg) { + (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) + | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { + v4_cfg.attached_subnets.update(|map| { + map.contains_key(&v4).then(|| { + let mut out = map.clone(); + out.remove(&v4); + out + }) + }) + } + (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) + | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. }) => { + v6_cfg.attached_subnets.update(|map| { + map.contains_key(&v6).then(|| { + let mut out = map.clone(); + out.remove(&v6); + out + }) + }) + } + // Trying to detach a CIDR class which this port cannot use.
+ _ => return Err(OpteError::InvalidIpCfg), + }; + + if changed { + refresh_nat_rules(port, inet_gw_map)?; + gateway::set_gateway_rules(port, vpc_mappings.clone())?; + } + + Ok(if !changed { + DetachSubnetResp::NotFound + } else { + DetachSubnetResp::Ok(req.cidr) + }) +} + +fn refresh_nat_rules( + port: &Port, + inet_gw_map: Option<&InternetGatewayMap>, +) -> Result<(), OpteError> { + let cfg = &port.network().cfg; + let (in_rules, out_rules) = create_nat_rules(cfg, inet_gw_map)?; port.set_rules_soft(NAT_LAYER_NAME, in_rules, out_rules) } diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index e4a356e8..d42a97a1 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The Oxide Network VPC Overlay. //! @@ -24,10 +24,12 @@ use crate::engine::geneve::ValidOxideOption; use alloc::borrow::Cow; use alloc::collections::BTreeSet; use alloc::collections::btree_map::BTreeMap; +use alloc::string::String; use alloc::string::ToString; use alloc::sync::Arc; use alloc::vec::Vec; use core::fmt; +use core::str::FromStr; use opte::api::Direction; use opte::api::Ipv4Addr; use opte::api::Ipv4Cidr; @@ -58,6 +60,7 @@ use opte::engine::ip::v6::Ipv6Push; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; +use opte::engine::nat::ExternalIpTag; use opte::engine::packet::InnerFlowId; use opte::engine::packet::MblkPacketData; use opte::engine::port::PortBuilder; @@ -268,28 +271,35 @@ impl StaticAction for EncapAction { // The router layer determines a RouterTarget and stores it in // the meta map. We need to map this virtual target to a // physical one. 
- let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) - { - Some(val) => val, - None => { - return Err(GenHtError::Unexpected { - msg: "no RouterTarget metadata entry found".to_string(), - }); - } - }; + let target = action_meta + .get_typed::() + .map_err(|e| GenHtError::Unexpected { msg: e.to_string() })?; - let target = RouterTargetInternal::from_meta(target_str).map_err( - |e| GenHtError::Unexpected { - msg: format!( - "failed to parse metadata entry '{target_str}': {e}", - ), - }, - )?; + let sent_from_eip = + action_meta.get_typed::().is_ok(); + + let recipient = match target { + RouterTargetInternal::Ip(virt_ip) => virt_ip, + _ => dst_ip, + }; match target { + // Currently, traffic directed at either attached external subnets or + // the external IPs of any other port always go through the V2B table. + // This requires a hairpin through the customer network, but provides + // strong isolation which some customers require. + // + // In future we want this to be a tunable property of the VPC. In this + // case we would require an extra table/poptrie per VPC, containing all + // external CIDR blocks visible across the VPC. We would then: + // * resolve `recipient` against this table, pulling the address of the + // owner's primary NIC. + // * if found, resolve the primary NIC address against the V2P. + // * Possibly add the Geneve external packet tag to the packet, esp. if + // crossing VPC boundaries. RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&dst_ip) { - Some(phys) => { + match self.v2b.get(&recipient) { + Some(phys) if sent_from_eip => { // Hash the packet onto a route target. This is a very // rudimentary mechanism. Should level-up to an ECMP // algorithm with well known statistical properties. @@ -309,13 +319,17 @@ impl StaticAction for EncapAction { false, ) } - None => return Ok(AllowOrDeny::Deny), + + // Sending traffic to boundary services *requires* that + // it is originated from an external IP. 
+ _ => return Ok(AllowOrDeny::Deny), } } - RouterTargetInternal::Ip(virt_ip) => { - match self.v2p.get(&virt_ip) { - Some(phys) => ( + RouterTargetInternal::Ip(_) + | RouterTargetInternal::VpcSubnet(_) => { + match self.v2p.get(&recipient) { + Some(phys) if !sent_from_eip => ( true, PhysNet { ether: phys.ether, @@ -325,9 +339,14 @@ impl StaticAction for EncapAction { false, ), - // The router target has specified a VPC IP we do not - // currently know about; this could be for two - // reasons: + // We have either attempted to forward traffic to a + // private IP/subnet from an external IP, or we failed + // to lookup the intended VPC IP. + // + // The former case can only occur when the guest is + // sending traffic from an attached external subnet. + // + // The latter case could arise for two reasons: // // 1. No such IP currently exists in the guest's VPC. // @@ -339,39 +358,7 @@ impl StaticAction for EncapAction { // the control plane; rather we drop the packet. If we // are dealing with scenario (2), the control plane // should eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), - } - } - - RouterTargetInternal::VpcSubnet(_) => { - match self.v2p.get(&flow_id.dst_ip()) { - Some(phys) => ( - true, - PhysNet { - ether: phys.ether, - ip: phys.ip, - vni: self.vni, - }, - false, - ), - - // The guest is attempting to contact a VPC IP we - // do not currently know about; this could be for - // two reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's - // VPC, but we do not yet have a mapping for - // it. - // - // We cannot differentiate these cases from the - // point of view of this code without more - // information from the control plane; rather we - // drop the packet. If we are dealing with - // scenario (2), the control plane should - // eventually provide us with a mapping. 
- None => return Ok(AllowOrDeny::Deny), + _ => return Ok(AllowOrDeny::Deny), } } } @@ -527,6 +514,23 @@ impl StaticAction for EncapAction { } } +/// Tag a packet with the VNI it will be sent on, or that was recorded in +/// encapsulation. +#[derive(Debug)] +pub struct VniTag(pub Vni); + +impl ActionMetaValue for VniTag { + const KEY: &'static str = "vni"; + + fn as_meta(&self) -> Cow<'static, str> { + self.0.to_string().into() + } + + fn from_meta(s: &str) -> Result { + Vni::from_str(s).map_err(|e| e.to_string()).map(Self) + } +} + #[derive(Default)] pub struct DecapAction {} @@ -544,8 +548,6 @@ impl fmt::Display for DecapAction { } } -pub const ACTION_META_VNI: &str = "vni"; - impl StaticAction for DecapAction { fn gen_ht( &self, @@ -588,7 +590,7 @@ impl StaticAction for DecapAction { // switch during NAT -- if found, `oxide_external_packet` // is filled. if !is_external { - action_meta.insert(ACTION_META_VNI.into(), vni.to_string().into()); + action_meta.insert_typed(&VniTag(vni)); } Ok(AllowOrDeny::Allow(HdrTransform { @@ -640,10 +642,7 @@ impl MetaAction for MulticastVniValidator { } // Check VNI from action metadata (set by DecapAction) - if let Some(vni_str) = action_meta.get(ACTION_META_VNI) - && let Ok(vni_val) = vni_str.parse::() - && let Ok(pkt_vni) = Vni::new(vni_val) - { + if let Ok(VniTag(pkt_vni)) = action_meta.get_typed() { let mcast_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); // Allow if VNI matches this VPC or fleet-wide multicast VNI if pkt_vni == self.my_vni || pkt_vni == mcast_vni { diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index 6f03f892..3f8b823d 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! The Oxide Network VPC Router. 
//! @@ -68,17 +68,6 @@ pub enum RouterTargetInternal { } impl RouterTargetInternal { - pub const IP_KEY: &'static str = "router-target-ip"; - pub const GENERIC_META: &'static str = "ig"; - - pub fn generic_meta(&self) -> Cow<'static, str> { - Self::GENERIC_META.into() - } - - pub fn ip_key(&self) -> Cow<'static, str> { - Self::IP_KEY.into() - } - pub fn class(&self) -> RouterTargetClass { match self { RouterTargetInternal::InternetGateway(_) => { @@ -487,13 +476,8 @@ impl MetaAction for RouterAction { _flow_id: &InnerFlowId, meta: &mut ActionMeta, ) -> ModMetaResult { - // TODO: I don't think we need IP_KEY. - if let RouterTargetInternal::InternetGateway(_) = self.target { - meta.insert(self.target.key(), self.target.as_meta()); - } - meta.insert(self.target.ip_key(), self.target.as_meta()); - let rt_class = self.target.class(); - meta.insert(rt_class.key(), rt_class.as_meta()); + meta.insert_typed(&self.target); + meta.insert_typed(&self.target.class()); Ok(AllowOrDeny::Allow(())) } } diff --git a/lib/oxide-vpc/tests/firewall_tests.rs b/lib/oxide-vpc/tests/firewall_tests.rs index 0be752fe..2aeca0f6 100644 --- a/lib/oxide-vpc/tests/firewall_tests.rs +++ b/lib/oxide-vpc/tests/firewall_tests.rs @@ -8,7 +8,6 @@ use opte::ddi::mblk::MsgBlk; use opte_test_utils as common; use common::*; -use oxide_vpc::api::BOUNDARY_SERVICES_VNI; #[test] fn firewall_replace_rules() { @@ -348,11 +347,6 @@ fn firewall_external_inbound() { // // This will appear on the same VNI as guest. 
// ================================================================ - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; let guest_phys = TestIpPhys { ip: g1_cfg.phys_ip, mac: g1_cfg.guest_mac, @@ -365,7 +359,7 @@ fn firewall_external_inbound() { g1_cfg.guest_mac, g1_cfg.ipv4().private_ip, ); - pkt1_m = encap_external(pkt1_m, bsvc_phys, guest_phys); + pkt1_m = encap_external(pkt1_m, *BSVC_PHYS, guest_phys); let pkt1 = parse_inbound(&mut pkt1_m, VpcParser {}).unwrap(); // ================================================================ diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index 7ff51ef6..a180ffdc 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Integration tests. //! 
@@ -60,7 +60,10 @@ use opte::ingot::types::HeaderParse; use opte::ingot::udp::Udp; use opte::ingot::udp::UdpRef; use opte_test_utils as common; +use oxide_vpc::api::AttachSubnetReq; +use oxide_vpc::api::AttachedSubnetConfig; use oxide_vpc::api::BOUNDARY_SERVICES_VNI; +use oxide_vpc::api::DetachSubnetReq; use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; @@ -105,6 +108,8 @@ fn lab_cfg() -> VpcCfg { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }); VpcCfg { ip_cfg, @@ -121,6 +126,7 @@ fn lab_cfg() -> VpcCfg { phys_ip: Ipv6Addr::from([ 0xFD00, 0x0000, 0x00F7, 0x0101, 0x0000, 0x0000, 0x0000, 0x0001, ]), + dhcp: base_dhcp_config(), } } @@ -1018,6 +1024,8 @@ fn multi_external_setup( ephemeral_ip: v4_eph, floating_ips: v4s[first_float..].to_vec(), }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -1031,6 +1039,8 @@ fn multi_external_setup( ephemeral_ip: v6_eph, floating_ips: v6s[first_float..].to_vec(), }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; @@ -1094,11 +1104,6 @@ fn check_external_ip_inbound_behaviour( ext_v4: &[Ipv4Addr], ext_v6: &[Ipv6Addr], ) { - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; let g1_phys = TestIpPhys { ip: cfg.phys_ip, mac: cfg.guest_mac, vni: cfg.vni }; @@ -1129,7 +1134,7 @@ fn check_external_ip_inbound_behaviour( flow_port, 80, ); - let mut pkt1_m = encap_external(pkt1, bsvc_phys, g1_phys); + let mut pkt1_m = encap_external(pkt1, *BSVC_PHYS, g1_phys); let pkt1 = parse_inbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = port.port.process(In, pkt1); @@ -1349,11 +1354,6 @@ fn external_ip_balanced_over_floating_ips() { #[test] fn external_ip_epoch_affinity_preserved() { let (mut g1, g1_cfg, ext_v4, ext_v6) = multi_external_ip_setup(2, 
true); - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; let g1_phys = TestIpPhys { ip: g1_cfg.phys_ip, mac: g1_cfg.guest_mac, @@ -1405,7 +1405,7 @@ fn external_ip_epoch_affinity_preserved() { }; let pkt1 = http_syn2(BS_MAC_ADDR, partner_ip, g1_cfg.guest_mac, ext_ip); - let mut pkt1_m = encap_external(pkt1, bsvc_phys, g1_phys); + let mut pkt1_m = encap_external(pkt1, *BSVC_PHYS, g1_phys); let pkt1 = parse_inbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = g1.port.process(In, pkt1); @@ -1425,7 +1425,7 @@ fn external_ip_epoch_affinity_preserved() { // Bumping epoch on other layers (e.g., firewall) is typically fine, // since that won't affect the internal flowtable for NAT. // ==================================================================== - nat::set_nat_rules(&g1.cfg, &g1.port, req.clone()).unwrap(); + nat::set_external_ips(&g1.port, req.clone()).unwrap(); update!(g1, ["incr:epoch", "set:nat.rules.in=4, nat.rules.out=7",]); // ================================================================ @@ -1499,7 +1499,7 @@ fn external_ip_reconfigurable() { // based on destination prefix. 
inet_gw_map: None, }; - nat::set_nat_rules(&g1.cfg, &g1.port, req).unwrap(); + nat::set_external_ips(&g1.port, req).unwrap(); update!( g1, [ @@ -1764,12 +1764,7 @@ fn snat_icmp_shared_echo_rewrite(dst_ip: IpAddr) { mac: g1_cfg.guest_mac, vni: g1_cfg.vni, }; - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; - pkt2_m = encap_external(pkt2_m, bsvc_phys, g1_phys); + pkt2_m = encap_external(pkt2_m, *BSVC_PHYS, g1_phys); pcap.add_pkt(&pkt2_m); let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap(); @@ -1824,7 +1819,7 @@ fn snat_icmp_shared_echo_rewrite(dst_ip: IpAddr) { &data[..], 2, ); - pkt4_m = encap_external(pkt4_m, bsvc_phys, g1_phys); + pkt4_m = encap_external(pkt4_m, *BSVC_PHYS, g1_phys); pcap.add_pkt(&pkt4_m); let pkt4 = parse_inbound(&mut pkt4_m, VpcParser {}).unwrap(); @@ -3720,6 +3715,8 @@ fn ephemeral_ip_preferred_over_snat_outbound() { ephemeral_ip: Some("10.60.1.20".parse().unwrap()), floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -3733,6 +3730,8 @@ fn ephemeral_ip_preferred_over_snat_outbound() { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; @@ -3812,6 +3811,8 @@ fn tcp_inbound() { ephemeral_ip: Some("10.60.1.20".parse().unwrap()), floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, ipv6: Ipv6Cfg { vpc_subnet: "fd00::/64".parse().unwrap(), @@ -3825,6 +3826,8 @@ fn tcp_inbound() { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; @@ -4383,6 +4386,300 @@ fn port_as_router_target() { let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap(); let res = g1.port.process(In, pkt2); expect_modified!(res, pkt2_m); + + // Removing CIDR blocks should piecewise remove the gateway rules. 
+ gateway::remove_cidr(&g2.port, cidr, Direction::In, g2.vpc_map.clone()) + .unwrap(); + update!(g2, ["incr:epoch", "decr:gateway.rules.in",]); + gateway::remove_cidr(&g2.port, cidr, Direction::Out, g2.vpc_map.clone()) + .unwrap(); + update!(g2, ["incr:epoch", "decr:gateway.rules.out",]); +} + +// RFD 599 defines two mechanisms relating to attaching subnets to +// instances: attached external and attached VPC subnets. +// Both of these require in/out exceptions in the gateway layer, but differ +// on some points: +// - Attached VPC subnets require the control plane to insert a system +// router rule mapping cidr(subnet)->primary_ip(instance) on all other ports. +// This is the moral equivalent of the `port_as_router_target` test above, +// without manual user configuration. What we want to test here is how they +// differ, and that rules do not interfere with transit IPs bound to the +// same blocks. +// - Attached external subnets should exempt any matching inbound traffic +// from undergoing NAT, and must ensure that outbound traffic cannot be +// directly sent to a VPC-private address. +#[test] +fn internal_attached_subnets() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + g1.port.start(); + set!(g1, "port_state=running"); + + // Attach the subnet. + let cidr = "10.0.0.0/8".parse().unwrap(); + nat::attach_subnet( + &g1.port, + None, + &g1.vpc_map, + AttachSubnetReq { + port_name: g1.port.name().into(), + cidr, + cfg: AttachedSubnetConfig { is_external: false }, + }, + ) + .unwrap(); + + update!(g1, ["set:epoch=5", "incr:gateway.rules.in, gateway.rules.out",]); + + // Suppose there is another port (same subnet) on G1's node. + let partner_ip: Ipv4Addr = "172.30.0.6".parse().unwrap(); + g1.vpc_map.add(partner_ip.into(), g1_cfg.phys_addr()); + + let my_ip = "10.0.123.45".parse().unwrap(); + + let data = b"1234\0"; + + // We can receive traffic on this attached subnet. 
+ let guest_phys = TestIpPhys { + ip: g1_cfg.phys_ip, + mac: g1_cfg.guest_mac, + vni: g1_cfg.vni, + }; + let partner_phys = TestIpPhys { + ip: g1_cfg.phys_ip, + mac: ox_vpc_mac([0xF0, 0x00, 0x66]), + vni: g1_cfg.vni, + }; + let mut pkt1_m = gen_icmpv4_echo_req( + partner_phys.mac, + g1_cfg.guest_mac, + partner_ip, + my_ip, + 7777, + 1, + data, + 1, + ); + pkt1_m = encap(pkt1_m, partner_phys, guest_phys); + + let pkt1 = parse_inbound(&mut pkt1_m, VpcParser {}).unwrap(); + let res = g1.port.process(In, pkt1); + expect_modified!(res, pkt1_m); + incr!( + g1, + [ + "firewall.flows.in, firewall.flows.out", + "stats.port.in_modified, stats.port.in_uft_miss, uft.in", + ] + ); + + // And we can send traffic from an arbitrary IP in the subnet. + let mut pkt2_m = gen_icmpv4_echo_reply( + g1_cfg.guest_mac, + g1_cfg.gateway_mac, + my_ip, + partner_ip, + 7777, + 1, + data, + 1, + ); + let pkt2 = parse_outbound(&mut pkt2_m, VpcParser {}).unwrap(); + let res = g1.port.process(Out, pkt2); + expect_modified!(res, pkt2_m); + incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out",]); + + // Add/remove of an identical transit IP range should be a NO-OP. + // (`incr` here implicitly asserts that the gateway rule count is unchanged). + gateway::allow_cidr(&g1.port, cidr, Direction::In, g1.vpc_map.clone()) + .unwrap(); + gateway::allow_cidr(&g1.port, cidr, Direction::Out, g1.vpc_map.clone()) + .unwrap(); + incr!(g1, ["epoch, epoch"]); + gateway::remove_cidr(&g1.port, cidr, Direction::In, g1.vpc_map.clone()) + .unwrap(); + gateway::remove_cidr(&g1.port, cidr, Direction::Out, g1.vpc_map.clone()) + .unwrap(); + incr!(g1, ["epoch, epoch"]); + + // ...until we remove the attachment itself. 
+ nat::detach_subnet( + &g1.port, + None, + &g1.vpc_map, + DetachSubnetReq { port_name: g1.port.name().into(), cidr }, + ) + .unwrap(); + update!(g1, ["set:epoch=11", "decr:gateway.rules.in, gateway.rules.out",]); +} + +#[test] +fn external_attached_subnets_dont_apply_nat() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Attach the subnet. + nat::attach_subnet( + &g1.port, + None, + &g1.vpc_map, + AttachSubnetReq { + port_name: g1.port.name().into(), + cidr: "8.0.0.0/8".parse().unwrap(), + cfg: AttachedSubnetConfig { is_external: true }, + }, + ) + .unwrap(); + + update!( + g1, + [ + "set:epoch=5", + "incr:gateway.rules.in, gateway.rules.out", + "incr:nat.rules.in, nat.rules.out" + ] + ); + + // Add default route. + router::add_entry( + &g1.port, + IpCidr::Ip4("0.0.0.0/0".parse().unwrap()), + RouterTarget::InternetGateway(None), + RouterClass::System, + ) + .unwrap(); + incr!(g1, ["epoch", "router.rules.out"]); + + let my_ext_ip = "8.8.8.8".parse().unwrap(); + let partner_ip = "1.1.1.1".parse().unwrap(); + + let data = b"1234\0"; + + // Have the guest receive a packet on an external IP in its owned + // 8.0.0.0/8 range. + let guest_phys = TestIpPhys { + ip: g1_cfg.phys_ip, + mac: g1_cfg.guest_mac, + vni: g1_cfg.vni, + }; + let mut pkt1_m = gen_icmpv4_echo_req( + BS_MAC_ADDR, + g1_cfg.guest_mac, + partner_ip, + my_ext_ip, + 7777, + 1, + data, + 1, + ); + pkt1_m = encap_external(pkt1_m, *BSVC_PHYS, guest_phys); + + let pkt1 = parse_inbound(&mut pkt1_m, VpcParser {}).unwrap(); + let res = g1.port.process(In, pkt1); + expect_modified!(res, pkt1_m); + incr!( + g1, + [ + "firewall.flows.in, firewall.flows.out", + "stats.port.in_modified, stats.port.in_uft_miss, uft.in", + ] + ); + + // This packet must not have had it's source/dest IP addresses alltered. 
+ let pkt1 = + parse_outbound(&mut pkt1_m, VpcParser {}).unwrap().to_full_meta(); + assert_eq!(pkt1.meta().inner_ip4().unwrap().source(), partner_ip); + assert_eq!(pkt1.meta().inner_ip4().unwrap().destination(), my_ext_ip); + + // A reply packet from the guest on these IPs should also be unchanged, + // and must be directed at boundary services. + let mut pkt2_m = gen_icmpv4_echo_reply( + g1_cfg.guest_mac, + g1_cfg.gateway_mac, + my_ext_ip, + partner_ip, + 7777, + 1, + data, + 1, + ); + let pkt2 = parse_outbound(&mut pkt2_m, VpcParser {}).unwrap(); + let res = g1.port.process(Out, pkt2); + expect_modified!(res, pkt2_m); + incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out",]); + let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap().to_full_meta(); + let L3::Ipv6(outer_ip6) = pkt2.meta().outer_ip().unwrap() else { + panic!("Encapsulation must be IPv6."); + }; + assert_eq!(outer_ip6.source(), g1_cfg.phys_ip); + assert_eq!(outer_ip6.destination(), BSVC_PHYS.ip); + assert_eq!(pkt2.meta().inner_ip4().unwrap().source(), my_ext_ip); + assert_eq!(pkt2.meta().inner_ip4().unwrap().destination(), partner_ip); +} + +#[test] +fn external_attached_subnets_cannot_reach_internal() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Attach the subnet. + nat::attach_subnet( + &g1.port, + None, + &g1.vpc_map, + AttachSubnetReq { + port_name: g1.port.name().into(), + cidr: "8.0.0.0/8".parse().unwrap(), + cfg: AttachedSubnetConfig { is_external: true }, + }, + ) + .unwrap(); + + update!( + g1, + [ + "set:epoch=5", + "incr:gateway.rules.in, gateway.rules.out", + "incr:nat.rules.in, nat.rules.out" + ] + ); + + // Suppose there is another port (same subnet) on G1's node. 
+ let partner_ip: Ipv4Addr = "172.30.0.6".parse().unwrap(); + g1.vpc_map.add(partner_ip.into(), g1_cfg.phys_addr()); + + let my_ext_ip = "8.8.8.8".parse().unwrap(); + + let data = b"1234\0"; + + // Have the guest attempt to send a packet from an external IP in its owned + // 8.0.0.0/8 range to a VPC-private address. As the source address is + // logically outside of the VPC-private scope we need to refuse to select a + // V2P mapping. + let mut pkt1_m = gen_icmpv4_echo_req( + g1_cfg.guest_mac, + g1_cfg.gateway_mac, + my_ext_ip, + partner_ip, + 7777, + 1, + data, + 1, + ); + + let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); + let res = g1.port.process(Out, pkt1); + assert_drop!( + res, + DropReason::Layer { name: "overlay", reason: DenyReason::Action } + ); } #[test] @@ -4433,6 +4730,8 @@ fn select_eip_conditioned_on_igw() { "192.168.0.4".parse().unwrap(), ], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, // Not really testing V6 here. Same principles apply. ipv6: Ipv6Cfg { @@ -4447,6 +4746,8 @@ fn select_eip_conditioned_on_igw() { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::new(), + transit_ips: BTreeMap::new(), }, }; @@ -4531,7 +4832,7 @@ fn select_eip_conditioned_on_igw() { // enables the limiting we aim to test here. 
inet_gw_map: Some(inet_gw_map), }; - nat::set_nat_rules(&g1.cfg, &g1.port, req).unwrap(); + nat::set_external_ips(&g1.port, req).unwrap(); update!(g1, ["incr:epoch", "set:nat.rules.out=8"]); // Send an ICMP packet for each destination, and verify that the @@ -4794,16 +5095,10 @@ fn icmpv6_inner_has_nat_applied() { ..Default::default() }; - let bsvc_phys = TestIpPhys { - ip: BS_IP_ADDR, - mac: BS_MAC_ADDR, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }; - let pkt_m = MsgBlk::new_ethernet_pkt((&eth, &ip, &body_bytes)); let mut pkt_m = encap_external( pkt_m, - bsvc_phys, + *BSVC_PHYS, TestIpPhys { ip: g1_cfg.phys_ip, mac: g1_cfg.guest_mac, diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index f97a99fe..784ed161 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use anyhow::Result; use anyhow::anyhow; @@ -44,6 +44,7 @@ use oxide_vpc::api::Vni; use oxide_vpc::api::VpcCfg; use rand::Rng; use std::cell::RefCell; +use std::collections::BTreeMap; use std::collections::HashSet; use std::process::Child; use std::process::Command; @@ -300,14 +301,17 @@ impl OptePort { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::default(), + transit_ips: BTreeMap::default(), }), guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), + dhcp: DhcpCfg::default(), }; let adm = OpteHdl::open()?; - adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + adm.create_xde(name, cfg.clone(), false)?; Ok(OptePort { name: name.into(), cfg, @@ -337,6 +341,8 @@ impl OptePort { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::default(), + transit_ips: BTreeMap::default(), }, ipv6: 
Ipv6Cfg { vpc_subnet: OVERLAY_NET_V6.parse().unwrap(), @@ -350,15 +356,18 @@ impl OptePort { ephemeral_ip: None, floating_ips: vec![], }, + attached_subnets: BTreeMap::default(), + transit_ips: BTreeMap::default(), }, }, guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), + dhcp: DhcpCfg::default(), }; let adm = OpteHdl::open()?; - adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + adm.create_xde(name, cfg.clone(), false)?; Ok(OptePort { name: name.into(), cfg, diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 576f3c24..29088871 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! xde - A mac provider for OPTE. //! @@ -277,11 +277,12 @@ use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; -use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; +use oxide_vpc::api::InternetGatewayMap; use oxide_vpc::api::ListPortsResp; use oxide_vpc::api::McastForwardingEntry; use oxide_vpc::api::McastSubscribeReq; @@ -593,8 +594,8 @@ pub struct XdeDev { // could setup ports for any number of network implementations. // However, that's not where things are today. pub port: Arc>, - vpc_cfg: VpcCfg, port_v2p: Arc, + port_igw_map: KMutex>, // Pass the packets through to the underlay devices, skipping // opte-core processing. 
@@ -631,6 +632,10 @@ impl XdeDev { unsafe { mac::mac_rx(self.mh, ptr::null_mut(), pkt.as_ptr()) } } } + + pub fn vpc_cfg(&self) -> &VpcCfg { + &self.port.network().cfg + } } // SAFETY: The sole pointer member (the mac handle) safely supports @@ -1024,6 +1029,16 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { hdlr_resp(&mut env, resp) } + OpteCmd::AttachSubnet => { + let resp = attach_subnet_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DetachSubnet => { + let resp = detach_subnet_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + OpteCmd::SetMcastForwarding => { let resp = set_mcast_forwarding_hdlr(&mut env); hdlr_resp(&mut env, resp) @@ -1160,11 +1175,10 @@ fn create_xde(req: &CreateXdeReq) -> Result { port_v2p.clone(), state.v2b.clone(), state.ectx.clone(), - &req.dhcp, )?, port_v2p, vni: cfg.vni, - vpc_cfg: cfg, + port_igw_map: KMutex::new(None), passthrough: req.passthrough, u1, u2, @@ -1352,7 +1366,7 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { } // Remove the VPC mappings for this port. - let cfg = &xde.vpc_cfg; + let cfg = xde.vpc_cfg(); let phys_net = PhysNet { ether: cfg.guest_mac, ip: cfg.phys_ip, vni: cfg.vni }; match cfg.ip_cfg { @@ -3140,7 +3154,6 @@ fn new_port( v2p: Arc, v2b: Arc, ectx: Arc, - dhcp_cfg: &DhcpCfg, ) -> Result>, OpteError> { let cfg = cfg.clone(); let name_cstr = match CString::new(name.as_str()) { @@ -3157,7 +3170,7 @@ fn new_port( // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? 
- gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; + gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; overlay::setup(&pb, &cfg, v2p, m2p, v2b, FT_LIMIT_ONE)?; @@ -4205,7 +4218,13 @@ fn set_external_ips_hdlr(env: &mut IoctlEnvelope) -> Result { .get_by_name(&req.port_name) .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; - nat::set_nat_rules(&dev.vpc_cfg, &dev.port, req)?; + { + let mut igw_map_lock = dev.port_igw_map.lock(); + *igw_map_lock = req.inet_gw_map.clone(); + + nat::set_external_ips(&dev.port, req)?; + } + Ok(NoResp::default()) } @@ -4236,27 +4255,56 @@ fn remove_cidr_hdlr( gateway::remove_cidr(&dev.port, req.cidr, req.dir, state.vpc_map.clone()) } +#[unsafe(no_mangle)] +fn attach_subnet_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: oxide_vpc::api::AttachSubnetReq = env.copy_in_req()?; + let state = get_xde_state(); + let devs = state.devs.read(); + let dev = devs + .get_by_name(&req.port_name) + .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; + + let igw_map_lock = dev.port_igw_map.lock(); + nat::attach_subnet(&dev.port, igw_map_lock.as_ref(), &state.vpc_map, req)?; + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn detach_subnet_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: oxide_vpc::api::DetachSubnetReq = env.copy_in_req()?; + let state = get_xde_state(); + let devs = state.devs.read(); + let dev = devs + .get_by_name(&req.port_name) + .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; + + let igw_map_lock = dev.port_igw_map.lock(); + nat::detach_subnet(&dev.port, igw_map_lock.as_ref(), &state.vpc_map, req) +} + #[unsafe(no_mangle)] fn list_ports_hdlr() -> Result { let mut resp = ListPortsResp { ports: vec![] }; let state = get_xde_state(); let devs = state.devs.read(); for dev in devs.iter() { - let ipv4_state = - dev.vpc_cfg.ipv4_cfg().map(|cfg| 
cfg.external_ips.load()); - let ipv6_state = - dev.vpc_cfg.ipv6_cfg().map(|cfg| cfg.external_ips.load()); + let cfg = dev.vpc_cfg(); + let ipv4_state = cfg.ipv4_cfg().map(|cfg| cfg.external_ips.load()); + let ipv6_state = cfg.ipv6_cfg().map(|cfg| cfg.external_ips.load()); resp.ports.push(PortInfo { name: dev.port.name().to_string(), mac_addr: dev.port.mac_addr(), - ip4_addr: dev.vpc_cfg.ipv4_cfg().map(|cfg| cfg.private_ip), + ip4_addr: cfg.ipv4_cfg().map(|cfg| cfg.private_ip), ephemeral_ip4_addr: ipv4_state .as_ref() .and_then(|cfg| cfg.ephemeral_ip), floating_ip4_addrs: ipv4_state .as_ref() .map(|cfg| cfg.floating_ips.clone()), - ip6_addr: dev.vpc_cfg.ipv6_cfg().map(|cfg| cfg.private_ip), + ip6_addr: cfg.ipv6_cfg().map(|cfg| cfg.private_ip), ephemeral_ip6_addr: ipv6_state .as_ref() .and_then(|cfg| cfg.ephemeral_ip), From eb158882ae4497e71dccd1dba58fc1e931b68674 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 6 Jan 2026 13:43:35 +0000 Subject: [PATCH 2/4] Self-review --- lib/opte/src/engine/port/meta.rs | 2 +- lib/oxide-vpc/src/api.rs | 4 ++-- lib/oxide-vpc/src/cfg.rs | 2 +- lib/oxide-vpc/src/engine/gateway/arp.rs | 2 +- lib/oxide-vpc/src/engine/gateway/dhcp.rs | 2 +- lib/oxide-vpc/src/engine/gateway/dhcpv6.rs | 2 +- lib/oxide-vpc/src/engine/gateway/icmp.rs | 2 +- lib/oxide-vpc/src/engine/gateway/icmpv6.rs | 2 +- lib/oxide-vpc/src/engine/gateway/mod.rs | 4 ++-- lib/oxide-vpc/src/engine/gateway/transit.rs | 5 ++--- lib/oxide-vpc/src/engine/overlay.rs | 9 +++++---- lib/oxide-vpc/tests/integration_tests.rs | 22 ++++++++++----------- xde/src/xde.rs | 8 +++----- 13 files changed, 32 insertions(+), 34 deletions(-) diff --git a/lib/opte/src/engine/port/meta.rs b/lib/opte/src/engine/port/meta.rs index 17d673b4..b6f92413 100644 --- a/lib/opte/src/engine/port/meta.rs +++ b/lib/opte/src/engine/port/meta.rs @@ -100,7 +100,7 @@ impl ActionMeta { self.inner.get(key).map(|v| &**v) } - /// Get a reference to the value at a well known key key, or `None` + /// Get a reference 
to the value at a well known key for `T`, or `None` /// if no such entry exists. pub fn get_typed( &self, diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index dda0cbdb..d8a49f17 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -141,7 +141,7 @@ pub struct AttachedSubnetConfig { pub is_external: bool, } -/// Configuration for an exceptions to source/destination address filtering. +/// Configuration for an exception to source/destination address filtering. #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct TransitIpConfig { /// Allow inbound traffic with a destination IP in the target CIDR. @@ -299,7 +299,7 @@ pub struct VpcCfg { /// sourced to a single IPv6 address. pub phys_ip: Ipv6Addr, - /// Configuration for DHCP responses created by OPTE + /// Configuration for DHCP responses created by OPTE. pub dhcp: DhcpCfg, } diff --git a/lib/oxide-vpc/src/cfg.rs b/lib/oxide-vpc/src/cfg.rs index 20122f42..e0377515 100644 --- a/lib/oxide-vpc/src/cfg.rs +++ b/lib/oxide-vpc/src/cfg.rs @@ -108,7 +108,7 @@ pub struct VpcCfg { /// sourced to a single IPv6 address. pub phys_ip: Ipv6Addr, - /// Configuration for DHCP responses created by OPTE + /// Configuration for DHCP responses created by OPTE. 
pub dhcp: DhcpCfg, } diff --git a/lib/oxide-vpc/src/engine/gateway/arp.rs b/lib/oxide-vpc/src/engine/gateway/arp.rs index 808af5b8..c71b8e93 100644 --- a/lib/oxide-vpc/src/engine/gateway/arp.rs +++ b/lib/oxide-vpc/src/engine/gateway/arp.rs @@ -16,7 +16,7 @@ use opte::engine::predicate::Predicate; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub(crate) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { +pub(super) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { // ================================================================ // Outbound ARP Request for Gateway, from Guest // diff --git a/lib/oxide-vpc/src/engine/gateway/dhcp.rs b/lib/oxide-vpc/src/engine/gateway/dhcp.rs index 6f3f82f4..c834e101 100644 --- a/lib/oxide-vpc/src/engine/gateway/dhcp.rs +++ b/lib/oxide-vpc/src/engine/gateway/dhcp.rs @@ -19,7 +19,7 @@ use opte::engine::ip::v4::Ipv4Cidr; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub(crate) fn setup( +pub(super) fn setup( ctx: &mut BuildCtx, ip_cfg: &Ipv4Cfg, ) -> Result<(), OpteError> { diff --git a/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs b/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs index 1fbaebdd..273be2b3 100644 --- a/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs +++ b/lib/oxide-vpc/src/engine/gateway/dhcpv6.rs @@ -15,7 +15,7 @@ use opte::engine::dhcpv6::LeasedAddress; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub(crate) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { +pub(super) fn setup(ctx: &mut BuildCtx) -> Result<(), OpteError> { let ip_cfg = match ctx.cfg.ipv6_cfg() { None => return Ok(()), Some(ip_cfg) => ip_cfg, diff --git a/lib/oxide-vpc/src/engine/gateway/icmp.rs b/lib/oxide-vpc/src/engine/gateway/icmp.rs index f7094891..d9584332 100644 --- a/lib/oxide-vpc/src/engine/gateway/icmp.rs +++ b/lib/oxide-vpc/src/engine/gateway/icmp.rs @@ -14,7 +14,7 @@ use opte::engine::icmp::v4::IcmpEchoReply; use opte::engine::rule::Action; use opte::engine::rule::Rule; -pub(crate) 
fn setup( +pub(super) fn setup( ctx: &mut BuildCtx, ip_cfg: &Ipv4Cfg, ) -> Result<(), OpteError> { diff --git a/lib/oxide-vpc/src/engine/gateway/icmpv6.rs b/lib/oxide-vpc/src/engine/gateway/icmpv6.rs index 6e27cbac..6e2dbbb3 100644 --- a/lib/oxide-vpc/src/engine/gateway/icmpv6.rs +++ b/lib/oxide-vpc/src/engine/gateway/icmpv6.rs @@ -32,7 +32,7 @@ use smoltcp::wire::Icmpv6Message; // - Respond to NDP Neighbor Solicitations from the guest to the gateway. This // includes solicitations unicast to the gateway, and also delivered to the // solicited-node multicast group. -pub(crate) fn setup( +pub(super) fn setup( ctx: &mut BuildCtx, ip_cfg: &Ipv6Cfg, ) -> Result<(), OpteError> { diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index 6aac9a6d..e2f410e8 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -112,7 +112,7 @@ use super::VpcNetwork; pub const NAME: &str = "gateway"; -pub(crate) struct BuildCtx<'a> { +struct BuildCtx<'a> { in_rules: Vec>, out_rules: Vec>, cfg: &'a VpcCfg, @@ -397,7 +397,7 @@ fn setup_ipv6(ctx: &mut BuildCtx, ip_cfg: &Ipv6Cfg) -> Result<(), OpteError> { /// /// This allows the outbound side of firewall layer to filter based on /// VPC. 
-pub(crate) struct VpcMeta { +struct VpcMeta { vpc_mappings: Arc, } diff --git a/lib/oxide-vpc/src/engine/gateway/transit.rs b/lib/oxide-vpc/src/engine/gateway/transit.rs index b8f7011a..f8206003 100644 --- a/lib/oxide-vpc/src/engine/gateway/transit.rs +++ b/lib/oxide-vpc/src/engine/gateway/transit.rs @@ -18,7 +18,7 @@ use opte::api::NoResp; use opte::engine::port::Port; use opte::engine::rule::Finalized; -pub(crate) fn make_holepunch_rule( +pub(super) fn make_holepunch_rule( guest_mac: MacAddr, gateway_mac: MacAddr, dest: IpCidr, @@ -50,8 +50,7 @@ pub(crate) fn make_holepunch_rule( cidr_in.finalize() } Direction::Out => { - let vpc_meta = vpc_meta.clone(); - let mut cidr_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut cidr_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); cidr_out.add_predicate(Predicate::InnerEtherSrc(vec![ EtherAddrMatch::Exact(guest_mac), ])); diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index d42a97a1..979ddd77 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -292,9 +292,10 @@ impl StaticAction for EncapAction { // In future we want this to be a tunable property of the VPC. In this // case we would require an extra table/poptrie per VPC, containing all // external CIDR blocks visible across the VPC. We would then: - // * resolve `recipient` against this table, pulling the address of the - // owner's primary NIC. - // * if found, resolve the primary NIC address against the V2P. + // * resolve `recipient` against this table when going via an IGW, + // pulling the address of the owner's primary NIC. + // * if found, resolve the primary NIC address against the V2P instead of + // the V2B. // * Possibly add the Geneve external packet tag to the packet, esp. if // crossing VPC boundaries. 
RouterTargetInternal::InternetGateway(_) => { @@ -517,7 +518,7 @@ impl StaticAction for EncapAction { /// Tag a packet with the VNI it will be sent on, or that was recorded in /// encapsulation. #[derive(Debug)] -pub struct VniTag(pub Vni); +pub(crate) struct VniTag(pub Vni); impl ActionMetaValue for VniTag { const KEY: &'static str = "vni"; diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index a180ffdc..c33039e2 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -1426,7 +1426,7 @@ fn external_ip_epoch_affinity_preserved() { // since that won't affect the internal flowtable for NAT. // ==================================================================== nat::set_external_ips(&g1.port, req.clone()).unwrap(); - update!(g1, ["incr:epoch", "set:nat.rules.in=4, nat.rules.out=7",]); + update!(g1, ["incr:epoch", "set:nat.rules.in=4, nat.rules.out=7"]); // ================================================================ // The reply packet must still originate from the ephemeral port @@ -4380,7 +4380,7 @@ fn port_as_router_target() { let pkt2 = parse_outbound(&mut pkt2_m, VpcParser {}).unwrap(); let res = g2.port.process(Out, pkt2); - incr!(g2, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out",]); + incr!(g2, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out"]); expect_modified!(res, pkt2_m); let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap(); @@ -4390,10 +4390,10 @@ fn port_as_router_target() { // Removing CIDR blocks should piecewise remove the gateway rules. 
gateway::remove_cidr(&g2.port, cidr, Direction::In, g2.vpc_map.clone()) .unwrap(); - update!(g2, ["incr:epoch", "decr:gateway.rules.in",]); + update!(g2, ["incr:epoch", "decr:gateway.rules.in"]); gateway::remove_cidr(&g2.port, cidr, Direction::Out, g2.vpc_map.clone()) .unwrap(); - update!(g2, ["incr:epoch", "decr:gateway.rules.out",]); + update!(g2, ["incr:epoch", "decr:gateway.rules.out"]); } // RFD 599 defines two mechanisms relating to attaching subnets to @@ -4430,9 +4430,9 @@ fn internal_attached_subnets() { ) .unwrap(); - update!(g1, ["set:epoch=5", "incr:gateway.rules.in, gateway.rules.out",]); + update!(g1, ["set:epoch=5", "incr:gateway.rules.in, gateway.rules.out"]); - // Suppose there is another port (same subnet) on G1's node. + // Suppose there is another port (same non-attached subnet) on G1's node. let partner_ip: Ipv4Addr = "172.30.0.6".parse().unwrap(); g1.vpc_map.add(partner_ip.into(), g1_cfg.phys_addr()); @@ -4488,7 +4488,7 @@ fn internal_attached_subnets() { let pkt2 = parse_outbound(&mut pkt2_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt2); expect_modified!(res, pkt2_m); - incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out",]); + incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out"]); // Add/remove of an identical transit IP range should be a NO-OP. // (`incr` here implicitly asserts that the gateway rule count is unchanged). @@ -4511,7 +4511,7 @@ fn internal_attached_subnets() { DetachSubnetReq { port_name: g1.port.name().into(), cidr }, ) .unwrap(); - update!(g1, ["set:epoch=11", "decr:gateway.rules.in, gateway.rules.out",]); + update!(g1, ["set:epoch=11", "decr:gateway.rules.in, gateway.rules.out"]); } #[test] @@ -4589,7 +4589,7 @@ fn external_attached_subnets_dont_apply_nat() { ] ); - // This packet must not have had it's source/dest IP addresses alltered. + // This packet must not have had its source/dest IP addresses altered. 
let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap().to_full_meta(); assert_eq!(pkt1.meta().inner_ip4().unwrap().source(), partner_ip); @@ -4610,7 +4610,7 @@ fn external_attached_subnets_dont_apply_nat() { let pkt2 = parse_outbound(&mut pkt2_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt2); expect_modified!(res, pkt2_m); - incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out",]); + incr!(g1, ["stats.port.out_modified, stats.port.out_uft_miss, uft.out"]); let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap().to_full_meta(); let L3::Ipv6(outer_ip6) = pkt2.meta().outer_ip().unwrap() else { panic!("Encapsulation must be IPv6."); @@ -4651,7 +4651,7 @@ fn external_attached_subnets_cannot_reach_internal() { ] ); - // Suppose there is another port (same subnet) on G1's node. + // Suppose there is another port (same non-attached subnet) on G1's node. let partner_ip: Ipv4Addr = "172.30.0.6".parse().unwrap(); g1.vpc_map.add(partner_ip.into(), g1_cfg.phys_addr()); diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 29088871..e7653b13 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -4218,12 +4218,10 @@ fn set_external_ips_hdlr(env: &mut IoctlEnvelope) -> Result { .get_by_name(&req.port_name) .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; - { - let mut igw_map_lock = dev.port_igw_map.lock(); - *igw_map_lock = req.inet_gw_map.clone(); + let mut igw_map_lock = dev.port_igw_map.lock(); + *igw_map_lock = req.inet_gw_map.clone(); - nat::set_external_ips(&dev.port, req)?; - } + nat::set_external_ips(&dev.port, req)?; Ok(NoResp::default()) } From f488213d1d408cb7276092cff8dde8abd3392880 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Mon, 19 Jan 2026 11:10:47 +0000 Subject: [PATCH 3/4] Review feedback: module for attached subnet ops --- lib/oxide-vpc/src/engine/attached_subnets.rs | 111 +++++++++++++++++++ lib/oxide-vpc/src/engine/mod.rs | 3 +- lib/oxide-vpc/src/engine/nat.rs | 102 +---------------- 
lib/oxide-vpc/tests/firewall_tests.rs | 2 +- lib/oxide-vpc/tests/integration_tests.rs | 9 +- xde/src/xde.rs | 15 ++- 6 files changed, 133 insertions(+), 109 deletions(-) create mode 100644 lib/oxide-vpc/src/engine/attached_subnets.rs diff --git a/lib/oxide-vpc/src/engine/attached_subnets.rs b/lib/oxide-vpc/src/engine/attached_subnets.rs new file mode 100644 index 00000000..7253590f --- /dev/null +++ b/lib/oxide-vpc/src/engine/attached_subnets.rs @@ -0,0 +1,111 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2026 Oxide Computer Company + +use super::VpcNetwork; +use super::overlay::VpcMappings; +use crate::api::AttachSubnetReq; +use crate::api::DetachSubnetReq; +use crate::api::DetachSubnetResp; +use crate::api::InternetGatewayMap; +use crate::cfg::IpCfg; +use alloc::sync::Arc; +use opte::api::IpCidr; +use opte::api::OpteError; +use opte::engine::port::Port; + +pub fn attach_subnet( + port: &Port, + inet_gw_map: Option<&InternetGatewayMap>, + vpc_mappings: &Arc, + req: AttachSubnetReq, +) -> Result<(), OpteError> { + let cfg = &port.network().cfg; + let changed = match (req.cidr, &cfg.ip_cfg) { + (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) + | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { + v4_cfg.attached_subnets.update(|map| { + let install = if let Some(val) = map.get(&v4) { + val != &req.cfg + } else { + true + }; + install.then(|| { + let mut out = map.clone(); + out.insert(v4, req.cfg); + out + }) + }) + } + (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) + | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. 
}) => { + v6_cfg.attached_subnets.update(|map| { + let install = if let Some(val) = map.get(&v6) { + val != &req.cfg + } else { + true + }; + install.then(|| { + let mut out = map.clone(); + out.insert(v6, req.cfg); + out + }) + }) + } + // Trying to attach a CIDR class which this port cannot use. + _ => return Err(OpteError::InvalidIpCfg), + }; + + if changed { + super::nat::refresh_nat_rules(port, inet_gw_map)?; + super::gateway::set_gateway_rules(port, vpc_mappings.clone())?; + } + + Ok(()) +} + +pub fn detach_subnet( + port: &Port, + inet_gw_map: Option<&InternetGatewayMap>, + vpc_mappings: &Arc, + req: DetachSubnetReq, +) -> Result { + let cfg = &port.network().cfg; + let changed = match (req.cidr, &cfg.ip_cfg) { + (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) + | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { + v4_cfg.attached_subnets.update(|map| { + map.contains_key(&v4).then(|| { + let mut out = map.clone(); + out.remove(&v4); + out + }) + }) + } + (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) + | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. }) => { + v6_cfg.attached_subnets.update(|map| { + map.contains_key(&v6).then(|| { + let mut out = map.clone(); + out.remove(&v6); + out + }) + }) + } + // Trying to detach a CIDR class which this port cannot use. + _ => return Err(OpteError::InvalidIpCfg), + }; + + if changed { + super::nat::refresh_nat_rules(port, inet_gw_map)?; + super::gateway::set_gateway_rules(port, vpc_mappings.clone())?; + } + + Ok(if !changed { + DetachSubnetResp::NotFound + } else { + DetachSubnetResp::Ok(req.cidr) + }) +} diff --git a/lib/oxide-vpc/src/engine/mod.rs b/lib/oxide-vpc/src/engine/mod.rs index ee94cdae..c87044fe 100644 --- a/lib/oxide-vpc/src/engine/mod.rs +++ b/lib/oxide-vpc/src/engine/mod.rs @@ -2,8 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company +pub mod attached_subnets; pub mod firewall; pub mod gateway; pub mod geneve; diff --git a/lib/oxide-vpc/src/engine/nat.rs b/lib/oxide-vpc/src/engine/nat.rs index 473b63ba..9c3ff1ca 100644 --- a/lib/oxide-vpc/src/engine/nat.rs +++ b/lib/oxide-vpc/src/engine/nat.rs @@ -5,14 +5,9 @@ // Copyright 2026 Oxide Computer Company use super::VpcNetwork; -use super::gateway; -use super::overlay::VpcMappings; use super::router::ROUTER_LAYER_NAME; use super::router::RouterTargetClass; use super::router::RouterTargetInternal; -use crate::api::AttachSubnetReq; -use crate::api::DetachSubnetReq; -use crate::api::DetachSubnetResp; use crate::api::ExternalIpCfg; use crate::api::InternetGatewayMap; use crate::api::SetExternalIpsReq; @@ -27,7 +22,6 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::num::NonZeroU32; use opte::api::IpAddr; -use opte::api::IpCidr; use opte::api::Ipv4Addr; use opte::api::Ipv6Addr; use opte::api::OpteError; @@ -560,101 +554,7 @@ pub fn set_external_ips( refresh_nat_rules(port, req.inet_gw_map.as_ref()) } -pub fn attach_subnet( - port: &Port, - inet_gw_map: Option<&InternetGatewayMap>, - vpc_mappings: &Arc, - req: AttachSubnetReq, -) -> Result<(), OpteError> { - let cfg = &port.network().cfg; - let changed = match (req.cidr, &cfg.ip_cfg) { - (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) - | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { - v4_cfg.attached_subnets.update(|map| { - let install = if let Some(val) = map.get(&v4) { - val != &req.cfg - } else { - true - }; - install.then(|| { - let mut out = map.clone(); - out.insert(v4, req.cfg); - out - }) - }) - } - (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) - | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. 
}) => { - v6_cfg.attached_subnets.update(|map| { - let install = if let Some(val) = map.get(&v6) { - val != &req.cfg - } else { - true - }; - install.then(|| { - let mut out = map.clone(); - out.insert(v6, req.cfg); - out - }) - }) - } - // Trying to attach a CIDR class which this port cannot use. - _ => return Err(OpteError::InvalidIpCfg), - }; - - if changed { - refresh_nat_rules(port, inet_gw_map)?; - gateway::set_gateway_rules(port, vpc_mappings.clone())?; - } - - Ok(()) -} - -pub fn detach_subnet( - port: &Port, - inet_gw_map: Option<&InternetGatewayMap>, - vpc_mappings: &Arc, - req: DetachSubnetReq, -) -> Result { - let cfg = &port.network().cfg; - let changed = match (req.cidr, &cfg.ip_cfg) { - (IpCidr::Ip4(v4), IpCfg::Ipv4(v4_cfg)) - | (IpCidr::Ip4(v4), IpCfg::DualStack { ipv4: v4_cfg, .. }) => { - v4_cfg.attached_subnets.update(|map| { - map.contains_key(&v4).then(|| { - let mut out = map.clone(); - out.remove(&v4); - out - }) - }) - } - (IpCidr::Ip6(v6), IpCfg::Ipv6(v6_cfg)) - | (IpCidr::Ip6(v6), IpCfg::DualStack { ipv6: v6_cfg, .. }) => { - v6_cfg.attached_subnets.update(|map| { - map.contains_key(&v6).then(|| { - let mut out = map.clone(); - out.remove(&v6); - out - }) - }) - } - // Trying to attach a CIDR class which this port cannot use. - _ => return Err(OpteError::InvalidIpCfg), - }; - - if changed { - refresh_nat_rules(port, inet_gw_map)?; - gateway::set_gateway_rules(port, vpc_mappings.clone())?; - } - - Ok(if !changed { - DetachSubnetResp::NotFound - } else { - DetachSubnetResp::Ok(req.cidr) - }) -} - -fn refresh_nat_rules( +pub(super) fn refresh_nat_rules( port: &Port, inet_gw_map: Option<&InternetGatewayMap>, ) -> Result<(), OpteError> { diff --git a/lib/oxide-vpc/tests/firewall_tests.rs b/lib/oxide-vpc/tests/firewall_tests.rs index 2aeca0f6..fb88df99 100644 --- a/lib/oxide-vpc/tests/firewall_tests.rs +++ b/lib/oxide-vpc/tests/firewall_tests.rs @@ -2,7 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use opte::ddi::mblk::MsgBlk; use opte_test_utils as common; diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index c33039e2..875c6b02 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -68,6 +68,7 @@ use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; use oxide_vpc::api::VpcCfg; +use oxide_vpc::engine::attached_subnets; use oxide_vpc::engine::geneve; use pcap::*; use smoltcp::phy::ChecksumCapabilities as CsumCapab; @@ -4418,7 +4419,7 @@ fn internal_attached_subnets() { // Attach the subnet. let cidr = "10.0.0.0/8".parse().unwrap(); - nat::attach_subnet( + attached_subnets::attach_subnet( &g1.port, None, &g1.vpc_map, @@ -4504,7 +4505,7 @@ fn internal_attached_subnets() { incr!(g1, ["epoch, epoch"]); // ...until we remove the attachment itself. - nat::detach_subnet( + attached_subnets::detach_subnet( &g1.port, None, &g1.vpc_map, @@ -4523,7 +4524,7 @@ fn external_attached_subnets_dont_apply_nat() { set!(g1, "port_state=running"); // Attach the subnet. - nat::attach_subnet( + attached_subnets::attach_subnet( &g1.port, None, &g1.vpc_map, @@ -4630,7 +4631,7 @@ fn external_attached_subnets_cannot_reach_internal() { set!(g1, "port_state=running"); // Attach the subnet. 
- nat::attach_subnet( + attached_subnets::attach_subnet( &g1.port, None, &g1.vpc_map, diff --git a/xde/src/xde.rs b/xde/src/xde.rs index ab30f063..fe2e0be6 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -304,6 +304,7 @@ use oxide_vpc::cfg::IpCfg; use oxide_vpc::cfg::VpcCfg; use oxide_vpc::engine::VpcNetwork; use oxide_vpc::engine::VpcParser; +use oxide_vpc::engine::attached_subnets; use oxide_vpc::engine::firewall; use oxide_vpc::engine::gateway; use oxide_vpc::engine::geneve::MssInfoRef; @@ -4270,7 +4271,12 @@ fn attach_subnet_hdlr(env: &mut IoctlEnvelope) -> Result { .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; let igw_map_lock = dev.port_igw_map.lock(); - nat::attach_subnet(&dev.port, igw_map_lock.as_ref(), &state.vpc_map, req)?; + attached_subnets::attach_subnet( + &dev.port, + igw_map_lock.as_ref(), + &state.vpc_map, + req, + )?; Ok(NoResp::default()) } @@ -4287,7 +4293,12 @@ fn detach_subnet_hdlr( .ok_or_else(|| OpteError::PortNotFound(req.port_name.clone()))?; let igw_map_lock = dev.port_igw_map.lock(); - nat::detach_subnet(&dev.port, igw_map_lock.as_ref(), &state.vpc_map, req) + attached_subnets::detach_subnet( + &dev.port, + igw_map_lock.as_ref(), + &state.vpc_map, + req, + ) } #[unsafe(no_mangle)] From c64712806b2eaa8d65aff47c9c0e99da2c12822b Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Mon, 19 Jan 2026 11:24:01 +0000 Subject: [PATCH 4/4] Review feedback: caveat on future EIP hairpin prevention --- lib/oxide-vpc/src/engine/overlay.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 979ddd77..d68fb237 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -289,7 +289,7 @@ impl StaticAction for EncapAction { // This requires a hairpin through the customer network, but provides // strong isolation which some customers require. 
// - // In future we want this to be a tunable property of the VPC. In this + // In future we may want this to be a tunable property of the VPC. In this // case we would require an extra table/poptrie per VPC, containing all // external CIDR blocks visible across the VPC. We would then: // * resolve `recipient` against this table when going via an IGW, @@ -298,6 +298,13 @@ impl StaticAction for EncapAction { // the V2B. // * Possibly add the Geneve external packet tag to the packet, esp. if // crossing VPC boundaries. + // This obviously works well for attached subnets, but for EIPs and FIPs + // we'll have quite a few /32 or /128 routing table entries which can't + // be aggregated unless adjacent external IPs point to the same instance + // (and this would probably be harmed further by SNAT allocation causing + // fragmentation). + // + // It's a possible optimisation, but it'd need more thought. RouterTargetInternal::InternetGateway(_) => { match self.v2b.get(&recipient) { Some(phys) if sent_from_eip => {