diff --git a/book/.vitepress/config.mts b/book/.vitepress/config.mts index cd2f03d3dd..3df03d81ad 100644 --- a/book/.vitepress/config.mts +++ b/book/.vitepress/config.mts @@ -52,6 +52,13 @@ export default defineConfig({ { text: 'Troubleshooting', link: 'troubleshooting' }, { text: 'Frequently Asked Questions', link: 'faq' }, ] + }, + { + text: 'Internals', + collapsed: false, + items: [ + { text: 'Netlink', link: 'netlink' }, + ] } ] }, diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md index 5e922b7bae..effa9677e9 100644 --- a/book/api/metrics-generated.md +++ b/book/api/metrics-generated.md @@ -462,3 +462,18 @@ | gossip_​gossip_​peer_​counts_​total | `gauge` | Number of gossip peers tracked (Total Peers Detected) | | gossip_​gossip_​peer_​counts_​active | `gauge` | Number of gossip peers tracked (Active) | | gossip_​gossip_​peer_​counts_​inactive | `gauge` | Number of gossip peers tracked (Inactive) | + +## Netlnk Tile +| Metric | Type | Description | +|--------|------|-------------| +| netlnk_​drop_​events | `counter` | Number of netlink drop events caught | +| netlnk_​link_​full_​syncs | `counter` | Number of full link table syncs done | +| netlnk_​route_​full_​syncs | `counter` | Number of full route table syncs done | +| netlnk_​updates_​link | `counter` | Number of netlink live updates processed (Link) | +| netlnk_​updates_​neigh | `counter` | Number of netlink live updates processed (Neighbor Table Entry) | +| netlnk_​updates_​ipv4_​route | `counter` | Number of netlink live updates processed (IPv4 Route Table Entry) | +| netlnk_​interface_​count | `gauge` | Number of network interfaces | +| netlnk_​route_​count_​local | `gauge` | Number of IPv4 routes (Local) | +| netlnk_​route_​count_​main | `gauge` | Number of IPv4 routes (Main) | +| netlnk_​neighbor_​solicits_​sent | `counter` | Number of neighbor solicit requests sent to kernel | +| netlnk_​neighbor_​solicits_​fails | `counter` | Number of neighbor solicit requests that failed 
to send | diff --git a/book/guide/netlink.md b/book/guide/netlink.md new file mode 100644 index 0000000000..7fd6046850 --- /dev/null +++ b/book/guide/netlink.md @@ -0,0 +1,111 @@ +# Netlink Integration + +## Summary + +Firedancer's userland networking stack sources configuration from netlink +to allow mostly zero config interoperability with Linux. + +This contrasts with other fast networking stacks which typically require +complex network configuration or a dedicated IP address. + +The following describes the netlink integration in detail. + +## Tile Overview + +Firedancer uses XDP for fast networking. This means that some packet +processing steps traditionally done in the kernel (with UDP sockets) now +have to be done in the Firedancer software. Specifically routing and +resolving link-level neighbors. + +The required information in these steps is requested from the kernel via +the [rtnetlink API](https://man7.org/linux/man-pages/man7/rtnetlink.7.html). +Doing all netlink requests in the data path (i.e. in the net tile) bears +security risk and is slow. + +The reasons netlink requests are done in a separate tile are: +- **Improved security architecture.** Firedancer's sandbox isolates the + netlink interface from untrusted user traffic +- **Better performance.** The netlink tile provides shared memory caches + that greatly reduce the amount of netlink requests. + +### "Netbase" shared memory region + +The netlink tile keeps a read-only cache of the following information: + +- Interface table +- IPv4 route tables `local` and `main` +- Neighbor tables (only for XDP-enabled Ethernet interfaces) + +The objects containing the above information are stored in the "netbase" +workspace. (A workspace is a shared memory region) + +### Security + +A netlink tile requires an rtnetlink socket. On startup, it subscribes +to route and neighbor table changes. It will also issue RTM_GETROUTE +and RTM_GETNEIGH requests. 
On RHEL 8 with a Linux 4.18 kernel, all +netlink interactions (including creation of the socket) can be done from +a regular unprivileged user without capabilities. + +The kernel's netlink interface exposes a large attack surface. +Therefore, this tile attempts to isolate itself from direct untrusted +inputs. + +### Data flows + +- `[net tiles] <-- [netbase]`
+ Net tiles have read only access to the shared memory region backing + the netbase object. A malicious netlink tile can compromise net tiles + by corrupting the netbase object, but not vice versa. + +- `[changes by sysadmin] --> [netlink] --> [netlink tile]`
+ Route table updates are forwarded to the netlink tile. This occurs + rarely (typically if the sysadmin performs manual changes or due to + a system daemon). + +- `[netlink tile] --> [netbase]`
+ The netlink tile writes neighbor and route table updates to a shared + memory region. + +- `[neighbor discovery] --> [netlink] --> [netlink tile]`
+ Neighbor table updates are forwarded to the netlink tile. This path + has limited throughput (few ~100K updates per second). + +- `[untrusted traffic] --> [net tile] --> [app tile]`
+ `--> [net tile] --> [netlink tile] --> [neighbor discovery]`
+ App tiles will blindly respond to the source IP found in untrusted + packets. This source IP can be spoofed. Neighbor solicitation might + be required in order to find out the MAC address of that IP. On IPv4, + these are ARP requests broadcasted to the local network. + + Net tiles cannot solicit neighbors directly, so they notify the + netlink tile that neighbor solicitation is needed. (Potentially at + line rate if network configuration is part of a huge subnet) + + The netlink tile will deduplicate these requests and forward them to + the kernel. + + This path is the only direct 'untrusted traffic' -> 'netlink tile' + data flow, so the internal neighbor solicit message format is kept + as simple as possible for security. + +### Neighbor discovery (ARP) + +A concurrent open addressed hash table is used to store ARP entries +(henceforth called "neighbor table"). This table attempts to +continuously stay in sync with the kernel. + +The netlink tile requests neighbor solicitations via the netlink +equivalent of `ip neigh add dev DEVICE IP use`. + +### Routing + +The Firedancer network stack supports very simple routing tables as +typically seen on cloud instances, servers directly connected to an +Ethernet switch, or a router. + +Only the "local" and "main" routing tables are synchronized. Policy +based routing and additional routing tables are NOT supported. + +Outgoing traffic matching the "local" table is sent to the loopback +device. 
diff --git a/src/app/fdctl/Local.mk b/src/app/fdctl/Local.mk index 8cef9b5c0b..a47da0fe2b 100644 --- a/src/app/fdctl/Local.mk +++ b/src/app/fdctl/Local.mk @@ -18,6 +18,7 @@ $(OBJDIR)/obj/app/fdctl/version.d: src/app/fdctl/version.h # fdctl core $(call add-objs,main1 config config_parse caps utility keys ready mem spy help version,fd_fdctl) +$(call add-objs,netconf,fd_fdctl) $(call add-objs,run/run run/run1 run/run_agave,fd_fdctl) $(call add-objs,monitor/monitor monitor/helper,fd_fdctl) $(call make-fuzz-test,fuzz_fdctl_config,fuzz_fdctl_config,fd_fdctl fd_ballet fd_util) diff --git a/src/app/fdctl/config.c b/src/app/fdctl/config.c index 083fdbb36c..945c7036ba 100644 --- a/src/app/fdctl/config.c +++ b/src/app/fdctl/config.c @@ -11,6 +11,9 @@ #include "../../flamenco/runtime/fd_blockstore.h" #include "../../flamenco/runtime/fd_txncache.h" #include "../../funk/fd_funk.h" +#include "../../waltz/ip/fd_fib4.h" +#include "../../waltz/mib/fd_dbl_buf.h" +#include "../../waltz/neigh/fd_neigh4_map.h" #include "../../util/net/fd_eth.h" #include "../../util/net/fd_ip4.h" @@ -218,12 +221,22 @@ fdctl_obj_align( fd_topo_t const * topo, return fd_fseq_align(); } else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) { return FD_METRICS_ALIGN; + } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) { + ulong align = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.align", obj->id ); + if( FD_UNLIKELY( align==ULONG_MAX ) ) FD_LOG_ERR(( "obj.%lu.align was not set", obj->id )); + return align; + } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) { + return fd_dbl_buf_align(); } else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) { return fd_blockstore_align(); } else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) { return fd_funk_align(); } else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) { return fd_txncache_align(); + } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) { + return fd_neigh4_hmap_align(); + } else if( FD_UNLIKELY( 
!strcmp( obj->name, "fib4" ) ) ) { + return fd_fib4_align(); } else { FD_LOG_ERR(( "unknown object `%s`", obj->name )); return 0UL; @@ -259,12 +272,20 @@ fdctl_obj_footprint( fd_topo_t const * topo, return fd_fseq_footprint(); } else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) { return FD_METRICS_FOOTPRINT( VAL("in_cnt"), VAL("cons_cnt") ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) { + return VAL("footprint"); + } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) { + return fd_dbl_buf_footprint( VAL("mtu") ); } else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) { return fd_blockstore_footprint( VAL("shred_max"), VAL("block_max"), VAL("idx_max"), VAL("txn_max") ) + VAL("alloc_max"); } else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) { return fd_funk_footprint(); } else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) { return fd_txncache_footprint( VAL("max_rooted_slots"), VAL("max_live_slots"), VAL("max_txn_per_slot"), FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) { + return fd_neigh4_hmap_footprint( VAL("ele_max"), VAL("lock_cnt"), VAL("probe_max") ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "fib4" ) ) ) { + return fd_fib4_footprint( VAL("route_max") ); } else { FD_LOG_ERR(( "unknown object `%s`", obj->name )); return 0UL; @@ -504,34 +525,6 @@ fdctl_cfg_from_env( int * pargc, config->tiles.net.ip_addr = iface_ip; mac_address( config->tiles.net.interface, config->tiles.net.mac_addr ); - /* support for multihomed hosts */ - ulong multi_cnt = config->tiles.net.multihome_ip_addrs_cnt; - for( ulong j = 0; j < multi_cnt; ++j ) { - int success = fd_cstr_to_ip4_addr( config->tiles.net.multihome_ip_addrs[j], - &config->tiles.net.multihome_ip4_addrs[j] ); - if( !success ) { - FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] " - "specifies malformed IP address `%s`", - config->tiles.net.multihome_ip_addrs[j] )); - } - } - - /* look 
for duplicate addresses */ - /* there's only a few, so do the O(n^2) comparison */ - for( ulong j = 0; j < multi_cnt; ++j ) { - if( config->tiles.net.ip_addr == config->tiles.net.multihome_ip4_addrs[j] ) { - FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] " - "specifies an address that matches [tiles.net.src_ip_addr]" )); - } - for( ulong k = j+1; k < multi_cnt; ++k ) { - if( config->tiles.net.multihome_ip4_addrs[j] == config->tiles.net.multihome_ip4_addrs[k] ) { - FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] " - "specifies duplicate ip addresses `%s`", - config->tiles.net.multihome_ip_addrs[j] )); - } - } - } - } username_to_id( config ); diff --git a/src/app/fdctl/config.h b/src/app/fdctl/config.h index f8718ac960..ae22828a9b 100644 --- a/src/app/fdctl/config.h +++ b/src/app/fdctl/config.h @@ -17,7 +17,7 @@ /* config_t represents all available configuration options that could be set in a user defined configuration toml file. For information about the options, see the `default.toml` file provided. 
*/ -typedef struct { +struct fdctl_config { char name[ NAME_SZ ]; char user[ 256 ]; char hostname[ FD_LOG_NAME_MAX ]; @@ -216,12 +216,13 @@ typedef struct { uint xdp_aio_depth; uint send_buffer_size; - - ulong multihome_ip_addrs_cnt; /* number of home ip addresses */ - char multihome_ip_addrs[FD_NET_MAX_SRC_ADDR][32]; - uint multihome_ip4_addrs[FD_NET_MAX_SRC_ADDR]; } net; + struct { + ulong max_routes; + ulong max_neighbors; + } netlink; + struct { ushort regular_transaction_listen_port; ushort quic_transaction_listen_port; @@ -319,7 +320,9 @@ typedef struct { } batch; } tiles; -} config_t; +}; + +typedef struct fdctl_config config_t; FD_PROTOTYPES_BEGIN diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index 9a738e24fd..53f2b15b8d 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -882,12 +882,29 @@ dynamic_port_range = "8900-9000" # this really be configurable? send_buffer_size = 16384 - # The XDP program will filter packets that aren't destined for - # the IPv4 address of the interface bound above, but sometimes a - # validator may advertise multiple IP addresses. In this case - # the additional addresses can be specified here, and packets - # addressed to them will be accepted. - multihome_ip_addrs = [] + # The netlink tile forwards Linux network configuration to net tiles. + # This config section contains advanced options that typically do not + # need to be changed. + # For further info, see https://docs.firedancer.io/guide/netlink.html + [tiles.netlink] + # The maximum number of routes per route table. + # + # The netlink tile imports two route tables from Linux, namely + # `local` and `main`. You can view them by running + # `ip route show table main`. Decreasing this option can result + # in connectivity issues. Increasing this option can drastically + # decrease performance. 
+ # + # For virtually all cloud and bare-metal server providers, the + # number of routes per table does not exceed 16. + max_routes = 128 + + # The maximum number of Ethernet neighbors. + # + # This should be roughly as large as the size your Ethernet subnet. + # E.g. if your IP address is 198.51.100.3/24, then your subnet has + # up to 256 neighbors (2^(32-24)). + max_neighbors = 4096 # QUIC tiles are responsible for serving network traffic, including # parsing and responding to packets and managing connection timeouts diff --git a/src/app/fdctl/config_parse.c b/src/app/fdctl/config_parse.c index f6eb07f4c8..9a7639f85a 100644 --- a/src/app/fdctl/config_parse.c +++ b/src/app/fdctl/config_parse.c @@ -290,7 +290,9 @@ fdctl_pod_to_cfg( config_t * config, CFG_POP ( uint, tiles.net.xdp_tx_queue_size ); CFG_POP ( uint, tiles.net.xdp_aio_depth ); CFG_POP ( uint, tiles.net.send_buffer_size ); - CFG_POP_ARRAY( cstr, tiles.net.multihome_ip_addrs ); + + CFG_POP ( ulong, tiles.netlink.max_routes ); + CFG_POP ( ulong, tiles.netlink.max_neighbors ); CFG_POP ( ushort, tiles.quic.regular_transaction_listen_port ); CFG_POP ( ushort, tiles.quic.quic_transaction_listen_port ); @@ -461,6 +463,9 @@ fdctl_cfg_validate( config_t * cfg ) { CFG_HAS_NON_ZERO ( tiles.net.xdp_aio_depth ); CFG_HAS_NON_ZERO ( tiles.net.send_buffer_size ); + CFG_HAS_NON_ZERO( tiles.netlink.max_routes ); + CFG_HAS_NON_ZERO( tiles.netlink.max_neighbors ); + CFG_HAS_NON_ZERO( tiles.quic.regular_transaction_listen_port ); CFG_HAS_NON_ZERO( tiles.quic.quic_transaction_listen_port ); CFG_HAS_NON_ZERO( tiles.quic.max_concurrent_connections ); diff --git a/src/app/fdctl/fdctl.h b/src/app/fdctl/fdctl.h index 56ef0724b5..2079cb5b1a 100644 --- a/src/app/fdctl/fdctl.h +++ b/src/app/fdctl/fdctl.h @@ -123,8 +123,7 @@ fdctl_obj_loose( fd_topo_t const * topo, fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t * tile ); -#define ACTIONS_CNT (11UL) -extern action_t ACTIONS[ ACTIONS_CNT ]; +extern action_t ACTIONS[]; void 
fdctl_boot( int * pargc, char *** pargv, @@ -207,6 +206,10 @@ void spy_cmd_fn( args_t * args, config_t * const config ); +void +netconf_cmd_fn( args_t * args, + config_t * config ); + void help_cmd_fn( args_t * args, config_t * const config ); diff --git a/src/app/fdctl/help.c b/src/app/fdctl/help.c index f795b1da83..7c1735e874 100644 --- a/src/app/fdctl/help.c +++ b/src/app/fdctl/help.c @@ -13,7 +13,7 @@ help_cmd_fn( args_t * args, --config parameter. */ FD_LOG_STDOUT(( " --config Path to config TOML file\n\n" )); FD_LOG_STDOUT(( "SUBCOMMANDS:\n" )); - for( ulong i=0; i #include -action_t ACTIONS[ ACTIONS_CNT ] = { +action_t ACTIONS[] = { { .name = "run", .args = NULL, .fn = run_cmd_fn, .perm = run_cmd_perm, .description = "Start up a Firedancer validator" }, { .name = "run1", .args = run1_cmd_args, .fn = run1_cmd_fn, .perm = NULL, .description = "Start up a single Firedancer tile" }, { .name = "run-agave", .args = NULL, .fn = run_agave_cmd_fn, .perm = NULL, .description = "Start up the Agave side of a Firedancer validator" }, @@ -16,8 +16,10 @@ action_t ACTIONS[ ACTIONS_CNT ] = { { .name = "ready", .args = NULL, .fn = ready_cmd_fn, .perm = NULL, .description = "Wait for all tiles to be running" }, { .name = "mem", .args = NULL, .fn = mem_cmd_fn, .perm = NULL, .description = "Print workspace memory and tile topology information" }, { .name = "spy", .args = NULL, .fn = spy_cmd_fn, .perm = NULL, .description = "Spy on and print out gossip traffic" }, + { .name = "netconf", .args = NULL, .fn = netconf_cmd_fn, .perm = NULL, .description = "Print network configuration" }, { .name = "help", .args = NULL, .fn = help_cmd_fn, .perm = NULL, .description = "Print this help message" }, { .name = "version", .args = NULL, .fn = version_cmd_fn, .perm = NULL, .description = "Show the current software version" }, + {0} }; struct action_alias { @@ -278,7 +280,7 @@ main1( int argc, } action_t * action = NULL; - for( ulong i=0; i +#include + +void +netconf_cmd_fn( args_t * args, + 
config_t * config ) { + (void)args; + + fd_topo_t * topo = &config->topo; + ulong wksp_id = fd_topo_find_wksp( topo, "netbase" ); + if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) { + FD_LOG_ERR(( "netbase workspace not found" )); + } + fd_topo_wksp_t * netbase = &topo->workspaces[ wksp_id ]; + + ulong tile_id = fd_topo_find_tile( topo, "netlnk", 0UL ); + if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) { + FD_LOG_ERR(( "netlnk tile not found" )); + } + fd_topo_tile_t * tile = &topo->tiles[ tile_id ]; + + fd_topo_join_workspace( topo, netbase, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + puts( "\nINTERFACES\n" ); + fd_dbl_buf_t * netdev_buf = fd_dbl_buf_join( fd_topo_obj_laddr( topo, tile->netlink.netdev_dbl_buf_obj_id ) ); + FD_TEST( netdev_buf ); + void * netdev_copy = aligned_alloc( fd_netdev_tbl_align(), fd_dbl_buf_obj_mtu( netdev_buf ) ); + fd_dbl_buf_read( netdev_buf, netdev_copy, NULL ); + fd_netdev_tbl_join_t netdev[1]; + FD_TEST( fd_netdev_tbl_join( netdev, netdev_copy ) ); + fd_netdev_tbl_fprintf( netdev, stdout ); + fd_netdev_tbl_leave( netdev ); + free( netdev_copy ); + fd_dbl_buf_leave( netdev_buf ); + + puts( "\nIPv4 ROUTES (main)\n" ); + fd_fib4_t * fib4_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_main_obj_id ) ); + FD_TEST( fib4_main ); + fd_fib4_fprintf( fib4_main, stdout ); + fd_fib4_leave( fib4_main ); + + puts( "\nIPv4 ROUTES (local)\n" ); + fd_fib4_t * fib4_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_local_obj_id ) ); + FD_TEST( fib4_local ); + fd_fib4_fprintf( fib4_local, stdout ); + fd_fib4_leave( fib4_local ); + + char if_name[ IF_NAMESIZE ] = "???"; + if( FD_UNLIKELY( !if_indextoname( tile->netlink.neigh_if_idx, if_name ) ) ) { + memcpy( if_name, "???", 4 ); + } + printf( "\nNEIGHBOR TABLE (%u-%s)\n\n", tile->netlink.neigh_if_idx, if_name ); + fd_neigh4_hmap_t neigh4[1]; + FD_TEST( fd_neigh4_hmap_join( neigh4, fd_topo_obj_laddr( topo, tile->netlink.neigh4_obj_id ), fd_topo_obj_laddr( topo, tile->netlink.neigh4_ele_obj_id ) 
) ); + fd_neigh4_hmap_fprintf( neigh4, stdout ); + fd_neigh4_hmap_leave( neigh4 ); + + puts( "" ); +} diff --git a/src/app/fdctl/run/run.c b/src/app/fdctl/run/run.c index 98f4831de7..f00522aff7 100644 --- a/src/app/fdctl/run/run.c +++ b/src/app/fdctl/run/run.c @@ -15,6 +15,9 @@ #include "../../../flamenco/runtime/fd_txncache.h" #include "../../../funk/fd_funk_filemap.h" #include "../../../funk/fd_funk.h" +#include "../../../waltz/ip/fd_fib4.h" +#include "../../../waltz/mib/fd_dbl_buf.h" +#include "../../../waltz/neigh/fd_neigh4_map.h" #include "../configure/configure.h" #include @@ -538,14 +541,22 @@ fdctl_obj_new( fd_topo_t const * topo, FD_TEST( fd_fseq_new( laddr, ULONG_MAX ) ); } else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) { FD_TEST( fd_metrics_new( laddr, VAL("in_cnt"), VAL("cons_cnt") ) ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) { + fd_memset( laddr, 0, VAL("footprint") ); } else if( FD_UNLIKELY( !strcmp( obj->name, "ulong" ) ) ) { *(ulong*)laddr = 0; + } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) { + FD_TEST( fd_dbl_buf_new( laddr, VAL("mtu"), 1UL ) ); } else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) { FD_TEST( fd_blockstore_new( laddr, VAL("wksp_tag"), VAL("seed"), VAL("shred_max"), VAL("block_max"), VAL("idx_max"), VAL("txn_max") ) ); } else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) { FD_TEST( fd_funk_new( laddr, VAL("wksp_tag"), VAL("seed"), VAL("txn_max"), VAL("rec_max") ) ); } else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) { FD_TEST( fd_txncache_new( laddr, VAL("max_rooted_slots"), VAL("max_live_slots"), VAL("max_txn_per_slot"), FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS ) ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) { + FD_TEST( fd_neigh4_hmap_new( laddr, VAL("ele_max"), VAL("lock_cnt"), VAL("probe_max"), VAL("seed") ) ); + } else if( FD_UNLIKELY( !strcmp( obj->name, "fib4" ) ) ) { + FD_TEST( fd_fib4_new( laddr, VAL("route_max") ) ); } else { 
FD_LOG_ERR(( "unknown object `%s`", obj->name )); } diff --git a/src/app/fdctl/run/topos/fd_firedancer.c b/src/app/fdctl/run/topos/fd_firedancer.c index b01b0a3e6e..185a79fedb 100644 --- a/src/app/fdctl/run/topos/fd_firedancer.c +++ b/src/app/fdctl/run/topos/fd_firedancer.c @@ -8,6 +8,7 @@ #include "../../../../disco/tiles.h" #include "../../../../disco/topo/fd_topob.h" #include "../../../../disco/topo/fd_pod_format.h" +#include "../../../../disco/netlink/fd_netlink_tile.h" /* fd_netlink_topo_create */ #include "../../../../flamenco/runtime/fd_blockstore.h" #include "../../../../flamenco/runtime/fd_runtime.h" #include "../../../../flamenco/runtime/fd_txncache.h" @@ -78,6 +79,7 @@ fd_topo_initialize( config_t * config ) { fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) }; /* topo, name */ + fd_topob_wksp( topo, "netbase" ); fd_topob_wksp( topo, "net_shred" ); fd_topob_wksp( topo, "net_gossip" ); fd_topob_wksp( topo, "net_repair" ); @@ -131,6 +133,7 @@ fd_topo_initialize( config_t * config ) { fd_topob_wksp( topo, "batch_replay" ); fd_topob_wksp( topo, "net" ); + fd_topob_wksp( topo, "netlink" ); fd_topob_wksp( topo, "quic" ); fd_topob_wksp( topo, "verify" ); fd_topob_wksp( topo, "dedup" ); @@ -239,6 +242,7 @@ fd_topo_initialize( config_t * config ) { /* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave */ FOR(net_tile_cnt) fd_topob_tile( topo, "net", "net", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); + fd_topo_tile_t * netlink_tile = fd_topob_tile( topo, "netlnk" , "netlink", "metric_in", ULONG_MAX, 0 ); FOR(quic_tile_cnt) fd_topob_tile( topo, "quic", "quic", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); FOR(verify_tile_cnt) fd_topob_tile( topo, "verify", "verify", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); /**/ fd_topob_tile( topo, "dedup", "dedup", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); @@ -346,6 +350,13 @@ fd_topo_initialize( config_t * config ) { topo->tile_cnt, affinity_tile_cnt )); } + /* The netlink tile 
shares various objects to net tiles */ + fd_netlink_topo_create( netlink_tile, topo, config ); + for( ulong i=0UL; itiles[ net_tile_id ] ); + } + /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */ FOR(net_tile_cnt) for( ulong j=0UL; jnet.repair_intake_listen_port = config->tiles.repair.repair_intake_listen_port; tile->net.repair_serve_listen_port = config->tiles.repair.repair_serve_listen_port; - /* multihome support */ - ulong multi_cnt = tile->net.multihome_ip_addrs_cnt = config->tiles.net.multihome_ip_addrs_cnt; - for( ulong j = 0; j < multi_cnt; ++j ) { - tile->net.multihome_ip_addrs[j] = config->tiles.net.multihome_ip4_addrs[j]; - } + } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) { + + /* already configured */ } else if( FD_UNLIKELY( !strcmp( tile->name, "quic" ) ) ) { fd_memcpy( tile->quic.src_mac_addr, config->tiles.net.mac_addr, 6 ); diff --git a/src/app/fdctl/run/topos/fd_frankendancer.c b/src/app/fdctl/run/topos/fd_frankendancer.c index c6cce58f2e..081592f9d3 100644 --- a/src/app/fdctl/run/topos/fd_frankendancer.c +++ b/src/app/fdctl/run/topos/fd_frankendancer.c @@ -4,6 +4,7 @@ #include "../../../../disco/tiles.h" #include "../../../../disco/topo/fd_topob.h" #include "../../../../disco/topo/fd_pod_format.h" +#include "../../../../disco/netlink/fd_netlink_tile.h" /* fd_netlink_topo_create */ #include "../../../../util/tile/fd_tile_private.h" #include "../../../../util/shmem/fd_shmem_private.h" @@ -19,6 +20,7 @@ fd_topo_initialize( config_t * config ) { fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) }; /* topo, name */ + fd_topob_wksp( topo, "netbase" ); fd_topob_wksp( topo, "net_quic" ); fd_topob_wksp( topo, "net_shred" ); fd_topob_wksp( topo, "quic_verify" ); @@ -38,6 +40,7 @@ fd_topo_initialize( config_t * config ) { fd_topob_wksp( topo, "sign_shred" ); fd_topob_wksp( topo, "net" ); + fd_topob_wksp( topo, "netlink" ); fd_topob_wksp( topo, "quic" ); fd_topob_wksp( topo, "verify" ); 
fd_topob_wksp( topo, "dedup" ); @@ -106,6 +109,8 @@ fd_topo_initialize( config_t * config ) { /* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave */ FOR(net_tile_cnt) fd_topob_tile( topo, "net", "net", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); + fd_topo_tile_t * netlink_tile = + /**/ fd_topob_tile( topo, "netlnk" , "netlink", "metric_in", ULONG_MAX, 0 ); FOR(quic_tile_cnt) fd_topob_tile( topo, "quic", "quic", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); FOR(verify_tile_cnt) fd_topob_tile( topo, "verify", "verify", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); /**/ fd_topob_tile( topo, "dedup", "dedup", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 ); @@ -157,6 +162,13 @@ fd_topo_initialize( config_t * config ) { } } + /* The netlink tile shares various objects to net tiles */ + fd_netlink_topo_create( netlink_tile, topo, config ); + for( ulong i=0UL; itiles[ net_tile_id ] ); + } + /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */ FOR(net_tile_cnt) for( ulong j=0UL; jnet.quic_transaction_listen_port = config->tiles.quic.quic_transaction_listen_port; tile->net.legacy_transaction_listen_port = config->tiles.quic.regular_transaction_listen_port; - /* multihome support */ - ulong multi_cnt = tile->net.multihome_ip_addrs_cnt = config->tiles.net.multihome_ip_addrs_cnt; - for( ulong j = 0; j < multi_cnt; ++j ) { - tile->net.multihome_ip_addrs[j] = config->tiles.net.multihome_ip4_addrs[j]; - } + } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) { + + /* already configured */ + } else if( FD_UNLIKELY( !strcmp( tile->name, "quic" ) ) ) { fd_memcpy( tile->quic.src_mac_addr, config->tiles.net.mac_addr, 6 ); diff --git a/src/app/fddev/main1.c b/src/app/fddev/main1.c index b6befd47c9..4a537eb304 100644 --- a/src/app/fddev/main1.c +++ b/src/app/fddev/main1.c @@ -34,6 +34,7 @@ configure_stage_t * STAGES[ CONFIGURE_STAGE_COUNT ] = { }; extern fd_topo_run_tile_t fd_tile_net; +extern fd_topo_run_tile_t 
fd_tile_netlink; extern fd_topo_run_tile_t fd_tile_quic; extern fd_topo_run_tile_t fd_tile_verify; extern fd_topo_run_tile_t fd_tile_dedup; @@ -69,6 +70,7 @@ extern fd_topo_run_tile_t fd_tile_rpcserv; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, + &fd_tile_netlink, &fd_tile_quic, &fd_tile_verify, &fd_tile_dedup, @@ -187,13 +189,13 @@ fddev_main( int argc, } action_t * action = NULL; - for( ulong i=0; i int: return 8 - + def count(self) -> int: return 1 @@ -85,7 +86,7 @@ def __init__(self, name: str, tile: Optional[Tile], description: str, clickhouse def footprint(self) -> int: return 8 * len(self.enum.values) - + def count(self) -> int: return len(self.enum.values) @@ -97,7 +98,7 @@ def __init__(self, name: str, tile: Optional[Tile], description: str, clickhouse def footprint(self) -> int: return 8 * len(self.enum.values) - + def count(self) -> int: return len(self.enum.values) @@ -140,7 +141,7 @@ def layout(self): def parse_metric(tile: Optional[Tile], metric: ET.Element, enums: Dict[str, MetricEnum]) -> Metric: name = metric.attrib['name'] description = "" - + summary_ele = metric.find('summary') if summary_ele is not None and summary_ele.text is not None: description = summary_ele.text @@ -201,7 +202,7 @@ def parse_metrics(xml_data: str) -> Metrics: Tile[tile.attrib['name'].upper()]: [ parse_metric(Tile[tile.attrib['name'].upper()], metric, enums) for metric in tile - ] + ] for tile in root.findall('tile') } @@ -212,5 +213,5 @@ def parse_metrics(xml_data: str) -> Metrics: link_out = root.find('linkout') assert link_out is not None link_out = [parse_metric(None, metric, enums) for metric in link_out] - + return Metrics(common=common, tiles=tiles, link_in=link_in, link_out=link_out, enums=enums) \ No newline at end of file diff --git a/src/disco/metrics/generated/Local.mk b/src/disco/metrics/generated/Local.mk index b497a7ac80..e3382537fc 100644 --- a/src/disco/metrics/generated/Local.mk +++ b/src/disco/metrics/generated/Local.mk @@ -1,2 +1,2 @@ -$(call 
add-hdrs,fd_metrics_all.h fd_metrics_quic.h) -$(call add-objs,fd_metrics_all fd_metrics_net fd_metrics_quic fd_metrics_verify fd_metrics_dedup fd_metrics_resolv fd_metrics_pack fd_metrics_bank fd_metrics_poh fd_metrics_store fd_metrics_shred fd_metrics_replay fd_metrics_storei fd_metrics_gossip,fd_disco) +$(call add-hdrs,$(notdir $(wildcard $(MKPATH)/*.h))) +$(call add-objs,$(patsubst %.c,%,$(notdir $(wildcard $(MKPATH)/*.c))),fd_disco) diff --git a/src/disco/metrics/generated/fd_metrics_all.c b/src/disco/metrics/generated/fd_metrics_all.c index bc01aeb5d2..e6f4da236e 100644 --- a/src/disco/metrics/generated/fd_metrics_all.c +++ b/src/disco/metrics/generated/fd_metrics_all.c @@ -49,6 +49,7 @@ const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT] = { "replay", "storei", "gossip", + "netlnk", }; const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = { @@ -65,6 +66,7 @@ const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = { FD_METRICS_REPLAY_TOTAL, FD_METRICS_STOREI_TOTAL, FD_METRICS_GOSSIP_TOTAL, + FD_METRICS_NETLNK_TOTAL, }; const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT] = { FD_METRICS_NET, @@ -80,4 +82,5 @@ const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT] FD_METRICS_REPLAY, FD_METRICS_STOREI, FD_METRICS_GOSSIP, + FD_METRICS_NETLNK, }; diff --git a/src/disco/metrics/generated/fd_metrics_all.h b/src/disco/metrics/generated/fd_metrics_all.h index 6a56847481..5cab0b3bd0 100644 --- a/src/disco/metrics/generated/fd_metrics_all.h +++ b/src/disco/metrics/generated/fd_metrics_all.h @@ -15,6 +15,7 @@ #include "fd_metrics_replay.h" #include "fd_metrics_storei.h" #include "fd_metrics_gossip.h" +#include "fd_metrics_netlnk.h" /* Start of LINK OUT metrics */ #define FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF (0UL) @@ -151,7 +152,7 @@ extern const fd_metrics_meta_t FD_METRICS_ALL_LINK_OUT[FD_METRICS_ALL_LINK_OUT_T #define FD_METRICS_TOTAL_SZ (8UL*222UL) -#define 
FD_METRICS_TILE_KIND_CNT 13 +#define FD_METRICS_TILE_KIND_CNT 14 extern const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT]; extern const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT]; extern const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT]; diff --git a/src/disco/metrics/generated/fd_metrics_enums.h b/src/disco/metrics/generated/fd_metrics_enums.h index 48d935e817..b2ca8f9264 100644 --- a/src/disco/metrics/generated/fd_metrics_enums.h +++ b/src/disco/metrics/generated/fd_metrics_enums.h @@ -466,3 +466,17 @@ #define FD_METRICS_ENUM_MAKE_PRUNE_EVENT_V_ENCODING_FAILED_IDX 2 #define FD_METRICS_ENUM_MAKE_PRUNE_EVENT_V_ENCODING_FAILED_NAME "encoding_failed" +#define FD_METRICS_ENUM_NETLINK_MSG_NAME "netlink_msg" +#define FD_METRICS_ENUM_NETLINK_MSG_V_LINK_IDX 0 +#define FD_METRICS_ENUM_NETLINK_MSG_V_LINK_NAME "link" +#define FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_IDX 1 +#define FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_NAME "neigh" +#define FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_IDX 2 +#define FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_NAME "ipv4_route" + +#define FD_METRICS_ENUM_ROUTE_TABLE_NAME "route_table" +#define FD_METRICS_ENUM_ROUTE_TABLE_V_LOCAL_IDX 0 +#define FD_METRICS_ENUM_ROUTE_TABLE_V_LOCAL_NAME "local" +#define FD_METRICS_ENUM_ROUTE_TABLE_V_MAIN_IDX 1 +#define FD_METRICS_ENUM_ROUTE_TABLE_V_MAIN_NAME "main" + diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.c b/src/disco/metrics/generated/fd_metrics_netlnk.c new file mode 100644 index 0000000000..a401144e9f --- /dev/null +++ b/src/disco/metrics/generated/fd_metrics_netlnk.c @@ -0,0 +1,16 @@ +/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. 
*/ +#include "fd_metrics_netlnk.h" + +const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL] = { + DECLARE_METRIC( NETLNK_DROP_EVENTS, COUNTER ), + DECLARE_METRIC( NETLNK_LINK_FULL_SYNCS, COUNTER ), + DECLARE_METRIC( NETLNK_ROUTE_FULL_SYNCS, COUNTER ), + DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, LINK ), + DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, NEIGH ), + DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, IPV4_ROUTE ), + DECLARE_METRIC( NETLNK_INTERFACE_COUNT, GAUGE ), + DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, LOCAL ), + DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, MAIN ), + DECLARE_METRIC( NETLNK_NEIGHBOR_SOLICITS_SENT, COUNTER ), + DECLARE_METRIC( NETLNK_NEIGHBOR_SOLICITS_FAILS, COUNTER ), +}; diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.h b/src/disco/metrics/generated/fd_metrics_netlnk.h new file mode 100644 index 0000000000..6eda56d6f1 --- /dev/null +++ b/src/disco/metrics/generated/fd_metrics_netlnk.h @@ -0,0 +1,64 @@ +/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. 
*/ + +#include "../fd_metrics_base.h" +#include "fd_metrics_enums.h" + +#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_OFF (16UL) +#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_NAME "netlnk_drop_events" +#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_DESC "Number of netlink drop events caught" +#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_OFF (17UL) +#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_NAME "netlnk_link_full_syncs" +#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_DESC "Number of full link table syncs done" +#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_OFF (18UL) +#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_NAME "netlnk_route_full_syncs" +#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_DESC "Number of full route table syncs done" +#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NETLNK_UPDATES_OFF (19UL) +#define FD_METRICS_COUNTER_NETLNK_UPDATES_NAME "netlnk_updates" +#define FD_METRICS_COUNTER_NETLNK_UPDATES_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_UPDATES_DESC "Number of netlink live updates processed" +#define FD_METRICS_COUNTER_NETLNK_UPDATES_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_COUNTER_NETLNK_UPDATES_CNT (3UL) + +#define FD_METRICS_COUNTER_NETLNK_UPDATES_LINK_OFF (19UL) +#define FD_METRICS_COUNTER_NETLNK_UPDATES_NEIGH_OFF (20UL) +#define FD_METRICS_COUNTER_NETLNK_UPDATES_IPV4_ROUTE_OFF (21UL) + +#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_OFF (22UL) +#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_NAME 
"netlnk_interface_count" +#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_TYPE (FD_METRICS_TYPE_GAUGE) +#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_DESC "Number of network interfaces" +#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_OFF (23UL) +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_NAME "netlnk_route_count" +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_TYPE (FD_METRICS_TYPE_GAUGE) +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_DESC "Number of IPv4 routes" +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_CNT (2UL) + +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_LOCAL_OFF (23UL) +#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_MAIN_OFF (24UL) + +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_OFF (25UL) +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_NAME "netlnk_neighbor_solicits_sent" +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_DESC "Number of neighbor solicit requests sent to kernel" +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_OFF (26UL) +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_NAME "netlnk_neighbor_solicits_fails" +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_DESC "Number of neighbor solicit requests that failed to send" +#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_NETLNK_TOTAL (11UL) +extern const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL]; diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml index ac51fe12c7..6c47172f41 100644 --- 
a/src/disco/metrics/metrics.xml +++ b/src/disco/metrics/metrics.xml @@ -588,4 +588,28 @@ metric introduced. + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/disco/netlink/Local.mk b/src/disco/netlink/Local.mk new file mode 100644 index 0000000000..c1e78d7145 --- /dev/null +++ b/src/disco/netlink/Local.mk @@ -0,0 +1,6 @@ +ifdef FD_HAS_LINUX +ifdef FD_HAS_SSE +$(call add-hdrs,fd_netlink_tile.h) +$(call add-objs,fd_netlink_tile,fd_disco) +endif +endif diff --git a/src/disco/netlink/fd_netlink_tile.c b/src/disco/netlink/fd_netlink_tile.c new file mode 100644 index 0000000000..c8acbe6f16 --- /dev/null +++ b/src/disco/netlink/fd_netlink_tile.c @@ -0,0 +1,383 @@ +#include "fd_netlink_tile_private.h" +#include "../topo/fd_topo.h" +#include "../topo/fd_topob.h" +#include "../topo/fd_pod_format.h" +#include "generated/netlink_seccomp.h" +#include "../metrics/fd_metrics.h" +#include "../../waltz/ip/fd_fib4_netlink.h" +#include "../../waltz/mib/fd_netdev_netlink.h" +#include "../../waltz/neigh/fd_neigh4_netlink.h" +#include "../../app/fdctl/config.h" /* FIXME inverse dependency */ +#include "../../util/log/fd_dtrace.h" + +#include +#include +#include /* SOL_{...} */ +#include /* getrandom */ +#include /* struct timeval */ +#include /* RTM_{...} */ + +/* Hardcoded limits */ +#define NETDEV_MAX (256U) +#define BOND_MASTER_MAX (256U) + +void +fd_netlink_topo_create( fd_topo_tile_t * netlink_tile, + fd_topo_t * topo, + struct fdctl_config const * config ) { + fd_topo_obj_t * netdev_dbl_buf_obj = fd_topob_obj( topo, "dbl_buf", "netbase" ); + fd_topo_obj_t * fib4_main_obj = fd_topob_obj( topo, "fib4", "netbase" ); + fd_topo_obj_t * fib4_local_obj = fd_topob_obj( topo, "fib4", "netbase" ); + fd_topo_obj_t * neigh4_obj = fd_topob_obj( topo, "neigh4_hmap", "netbase" ); + fd_topo_obj_t * neigh4_ele_obj = fd_topob_obj( topo, "opaque", "netbase" ); + + fd_topob_tile_uses( topo, netlink_tile, netdev_dbl_buf_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, 
netlink_tile, fib4_main_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, netlink_tile, fib4_local_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, netlink_tile, neigh4_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, netlink_tile, neigh4_ele_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + /* Configure double buffer of netdev table */ + ulong const netdev_dbl_buf_mtu = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ); + FD_TEST( fd_pod_insertf_ulong( topo->props, netdev_dbl_buf_mtu, "obj.%lu.mtu", netdev_dbl_buf_obj->id ) ); + + /* Configure route table */ + FD_TEST( fd_pod_insertf_ulong( topo->props, config->tiles.netlink.max_routes, "obj.%lu.route_max", fib4_main_obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, config->tiles.netlink.max_routes, "obj.%lu.route_max", fib4_local_obj->id ) ); + + /* Configure neighbor hashmap: Open addressed hashmap with 3.0 sparsity + factor and 16 long probe chain */ + uint const neigh_if_idx = if_nametoindex( config->tiles.net.interface ); + if( FD_UNLIKELY( !neigh_if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed (%i-%s)", config->tiles.net.interface, errno, fd_io_strerror( errno ) )); + ulong const neigh_ele_max = fd_ulong_pow2_up( 3UL * config->tiles.netlink.max_neighbors ); + ulong const neigh_ele_align = alignof(fd_neigh4_entry_t); + ulong const neigh_ele_fp = neigh_ele_max * sizeof(fd_neigh4_entry_t); + FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_max, "obj.%lu.ele_max", neigh4_obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, 16UL, "obj.%lu.probe_max", neigh4_obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, 4UL, "obj.%lu.lock_cnt", neigh4_obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_align, "obj.%lu.align", neigh4_ele_obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_fp, "obj.%lu.footprint", neigh4_ele_obj->id ) ); + + /* Pick a random hashmap seed */ + ulong seed; + FD_TEST( 8UL==getrandom( 
&seed, sizeof(ulong), 0 ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", neigh4_obj->id ) ); + + netlink_tile->netlink.netdev_dbl_buf_obj_id = netdev_dbl_buf_obj->id; + netlink_tile->netlink.fib4_main_obj_id = fib4_main_obj->id; + netlink_tile->netlink.fib4_local_obj_id = fib4_local_obj->id; + netlink_tile->netlink.neigh_if_idx = neigh_if_idx; + netlink_tile->netlink.neigh4_obj_id = neigh4_obj->id; + netlink_tile->netlink.neigh4_ele_obj_id = neigh4_ele_obj->id; +} + +void +fd_netlink_topo_join( fd_topo_t * topo, + fd_topo_tile_t * netlink_tile, + fd_topo_tile_t * join_tile ) { + fd_topob_tile_uses( topo, join_tile, &topo->objs[ netlink_tile->netlink.neigh4_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, join_tile, &topo->objs[ netlink_tile->netlink.neigh4_ele_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, join_tile, &topo->objs[ netlink_tile->netlink.fib4_main_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, join_tile, &topo->objs[ netlink_tile->netlink.fib4_local_obj_id ], FD_SHMEM_JOIN_MODE_READ_ONLY ); +} + +/* Timing details: + + Housekeeping is done every 97ms. + Socket receives block up to 43ms. 
*/ + +/* Begin tile methods */ + +FD_FN_CONST static inline ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_netlink_tile_ctx_t), FD_NETDEV_TBL_ALIGN ); +} + +FD_FN_PURE static inline ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_netlink_tile_ctx_t), sizeof(fd_netlink_tile_ctx_t) ); + l = FD_LAYOUT_APPEND( l, fd_netdev_tbl_align(), fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ) ); + return FD_LAYOUT_FINI( l, scratch_align() ); +} + +static ulong +populate_allowed_seccomp( fd_topo_t const * topo, + fd_topo_tile_t const * tile, + ulong out_cnt, + struct sock_filter * out ) { + fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC ); + populate_sock_filter_policy_netlink( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->nl_monitor->fd, (uint)ctx->nl_req->fd ); + return sock_filter_policy_netlink_instr_cnt; +} + +static ulong +populate_allowed_fds( fd_topo_t const * topo, + fd_topo_tile_t const * tile, + ulong out_fds_cnt, + int * out_fds ) { + fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC ); + + if( FD_UNLIKELY( out_fds_cnt<4UL ) ) FD_LOG_ERR(( "out_fds_cnt too low (%lu)", out_fds_cnt )); + + ulong out_cnt = 0UL; + out_fds[ out_cnt++ ] = 2; /* stderr */ + if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) ) + out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */ + out_fds[ out_cnt++ ] = ctx->nl_monitor->fd; + out_fds[ out_cnt++ ] = ctx->nl_req->fd; + return out_cnt; +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id!=0 ) ) { + FD_LOG_ERR(( "Topology contains more than one netlink tile" )); + } + + fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + /* FIXME zero memory? 
*/ + ctx->magic = FD_NETLINK_TILE_CTX_MAGIC; + ctx->neigh4_ifidx = tile->netlink.neigh_if_idx; + + if( FD_UNLIKELY( !fd_netlink_init( ctx->nl_monitor, 1000U ) ) ) { + FD_LOG_ERR(( "Failed to connect to rtnetlink" )); + } + if( FD_UNLIKELY( !fd_netlink_init( ctx->nl_req, 9000000U ) ) ) { + FD_LOG_ERR(( "Failed to connect to rtnetlink" )); + } + + union { + struct sockaddr sa; + struct sockaddr_nl sanl; + } sa; + sa.sanl = (struct sockaddr_nl) { + .nl_family = AF_NETLINK, + .nl_groups = RTMGRP_LINK | RTMGRP_NEIGH | RTMGRP_IPV4_ROUTE + }; + if( FD_UNLIKELY( 0!=bind( ctx->nl_monitor->fd, &sa.sa, sizeof(struct sockaddr_nl) ) ) ) { + FD_LOG_ERR(( "bind(sock,RT_NETLINK,RTMGRP_{LINK,NEIGH,IPV4_ROUTE}) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } + + struct timeval tv = { .tv_usec = 43000, }; /* 43ms */ + if( FD_UNLIKELY( 0!=setsockopt( ctx->nl_monitor->fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval) ) ) ) { + FD_LOG_ERR(( "setsockopt(sock,SOL_SOCKET,SO_RCVTIMEO) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + fd_netlink_tile_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netlink_tile_ctx_t), sizeof(fd_netlink_tile_ctx_t) ); + FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC ); + ctx->netdev_sz = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ); + ctx->netdev_local = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_sz ); + + FD_TEST( tile->netlink.netdev_dbl_buf_obj_id ); + FD_TEST( tile->netlink.neigh4_obj_id ); + FD_TEST( tile->netlink.neigh4_ele_obj_id ); + FD_TEST( tile->netlink.fib4_local_obj_id ); + FD_TEST( tile->netlink.fib4_main_obj_id ); + + FD_TEST( fd_netdev_tbl_new( ctx->netdev_local, NETDEV_MAX, BOND_MASTER_MAX ) ); + FD_TEST( fd_netdev_tbl_join( ctx->netdev_tbl, ctx->netdev_local ) ); + + FD_TEST( ctx->netdev_buf = fd_dbl_buf_join( 
fd_topo_obj_laddr( topo, tile->netlink.netdev_dbl_buf_obj_id ) ) ); + + FD_TEST( fd_neigh4_hmap_join( ctx->neigh4, fd_topo_obj_laddr( topo, tile->netlink.neigh4_obj_id ), fd_topo_obj_laddr( topo, tile->netlink.neigh4_ele_obj_id ) ) ); + ctx->fib4_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_local_obj_id ) ); FD_TEST( ctx->fib4_local ); + ctx->fib4_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_main_obj_id ) ); FD_TEST( ctx->fib4_main ); + + for( ulong i=0UL; iin_cnt; i++ ) { + fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ]; + if( FD_UNLIKELY( link->mtu!=0UL ) ) FD_LOG_ERR(( "netlink solicit links must have an MTU of zero" )); + } + + ctx->action |= FD_NET_TILE_ACTION_LINK_UPDATE; + ctx->action |= FD_NET_TILE_ACTION_ROUTE4_UPDATE; + ctx->action |= FD_NET_TILE_ACTION_NEIGH_UPDATE; + + ctx->update_backoff = (long)( fd_tempo_tick_per_ns( NULL ) * 10e6 ); /* 10ms */ +} + +/* Begin stem methods + + Note: Using stem here might seem odd since fd_netlink_tile does not + send or receive any messages. Use of stem here is justified because of + the initialization, generic metrics, and event loop functionality it + provides. 
*/ + +static inline void +metrics_write( fd_netlink_tile_ctx_t * ctx ) { + FD_MCNT_SET( NETLNK, DROP_EVENTS, fd_netlink_enobufs_cnt ); + FD_MCNT_SET( NETLNK, LINK_FULL_SYNCS, ctx->metrics.link_full_syncs ); + FD_MCNT_SET( NETLNK, ROUTE_FULL_SYNCS, ctx->metrics.route_full_syncs ); + FD_MCNT_ENUM_COPY( NETLNK, UPDATES, ctx->metrics.update_cnt ); + FD_MGAUGE_SET( NETLNK, INTERFACE_COUNT, ctx->netdev_tbl->hdr->dev_cnt ); + FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_LOCAL, fd_fib4_cnt( ctx->fib4_local ) ); + FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_MAIN, fd_fib4_cnt( ctx->fib4_main ) ); + FD_MCNT_SET( NETLNK, NEIGHBOR_SOLICITS_SENT, ctx->metrics.neigh_solicits_sent ); + FD_MCNT_SET( NETLNK, NEIGHBOR_SOLICITS_FAILS, ctx->metrics.neigh_solicits_fails ); +} + +static inline void +during_housekeeping( fd_netlink_tile_ctx_t * ctx ) { + long now = fd_tickcount(); + if( ctx->action & FD_NET_TILE_ACTION_LINK_UPDATE ) { + if( now < ctx->link_update_ts ) return; + ctx->action &= ~FD_NET_TILE_ACTION_LINK_UPDATE; + fd_netdev_netlink_load_table( ctx->netdev_tbl, ctx->nl_req ); + fd_dbl_buf_insert( ctx->netdev_buf, ctx->netdev_local, ctx->netdev_sz ); + ctx->link_update_ts = now+ctx->update_backoff; + ctx->metrics.link_full_syncs++; + } + if( ctx->action & FD_NET_TILE_ACTION_ROUTE4_UPDATE ) { + if( now < ctx->route4_update_ts ) return; + ctx->action &= ~FD_NET_TILE_ACTION_ROUTE4_UPDATE; + fd_fib4_netlink_load_table( ctx->fib4_local, ctx->nl_req, RT_TABLE_LOCAL ); + fd_fib4_netlink_load_table( ctx->fib4_main, ctx->nl_req, RT_TABLE_MAIN ); + ctx->route4_update_ts = now+ctx->update_backoff; + ctx->metrics.route_full_syncs++; + } + if( ctx->action & FD_NET_TILE_ACTION_NEIGH_UPDATE ) { + ctx->action &= ~FD_NET_TILE_ACTION_NEIGH_UPDATE; + fd_neigh4_netlink_request_dump( ctx->nl_req, ctx->neigh4_ifidx ); + uchar buf[ 4096 ]; + fd_netlink_iter_t iter[1]; + for( fd_netlink_iter_init( iter, ctx->nl_req, buf, sizeof(buf) ); + !fd_netlink_iter_done( iter ); + fd_netlink_iter_next( iter, ctx->nl_req ) ) { + 
fd_neigh4_netlink_ingest_message( ctx->neigh4, fd_netlink_iter_msg( iter ), ctx->neigh4_ifidx ); + } + } +} + +static inline void +before_credit( fd_netlink_tile_ctx_t * ctx, + fd_stem_context_t * stem, + int * charge_busy ) { + (void)stem; + + uchar msg[ 16384 ]; + long msg_sz = recvfrom( ctx->nl_monitor->fd, msg, sizeof(msg), 0, NULL, NULL ); + if( msg_sz<=0L ) return; + + /* FIXME the reported busy% should not include any wait time */ + *charge_busy = 1; + + struct nlmsghdr * nlh = fd_type_pun( msg ); + FD_DTRACE_PROBE_4( netlink_update, nlh->nlmsg_seq, nlh->nlmsg_type, nlh->nlmsg_len, nlh->nlmsg_flags ); + switch( nlh->nlmsg_type ) { + case RTM_NEWLINK: + case RTM_DELLINK: + ctx->action |= FD_NET_TILE_ACTION_LINK_UPDATE; + ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_LINK_IDX ]++; + break; + case RTM_NEWROUTE: + case RTM_DELROUTE: + ctx->action |= FD_NET_TILE_ACTION_ROUTE4_UPDATE; + ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_IDX ]++; + break; + case RTM_NEWNEIGH: + case RTM_DELNEIGH: { + fd_neigh4_netlink_ingest_message( ctx->neigh4, nlh, ctx->neigh4_ifidx ); + ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_IDX ]++; + break; + } + default: + FD_LOG_INFO(( "Received unexpected netlink message type %u", nlh->nlmsg_type )); + break; + } + +} + +/* after_frag handles a neighbor solicit request */ + +static void +after_frag( fd_netlink_tile_ctx_t * ctx, + ulong in_idx, + ulong seq, + ulong sig, + ulong sz, + ulong tsorig, + fd_stem_context_t * stem ) { + (void)in_idx; (void)seq; (void)tsorig; (void)stem; + + /* Parse request (fully contained in sig field) */ + + if( FD_UNLIKELY( sz!=0UL ) ) { + FD_LOG_WARNING(( "unexpected sz %lu", sz )); + } + if( FD_UNLIKELY( sig>>48 ) ) { + FD_LOG_WARNING(( "unexpected high bits in sig %016lx", sig )); + } + ushort if_idx = (ushort)(sig>>32); + uint ip4_addr = (uint)sig; + if( FD_UNLIKELY( if_idx!=ctx->neigh4_ifidx ) ) { + ctx->metrics.neigh_solicits_fails++; + return; + } + + /* 
Drop if the kernel is already working on the request */ + + fd_neigh4_hmap_query_t query[1]; + int spec_res = fd_neigh4_hmap_query_try( ctx->neigh4, &ip4_addr, NULL, query, 0 ); + if( spec_res==FD_MAP_SUCCESS ) return; + + /* Insert placeholder (take above branch next time) */ + + int prepare_res = fd_neigh4_hmap_prepare( ctx->neigh4, &ip4_addr, NULL, query, 0 ); + if( FD_UNLIKELY( prepare_res!=FD_MAP_SUCCESS ) ) { + ctx->metrics.neigh_solicits_fails++; + return; + } + fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query ); + ele->state = FD_NEIGH4_STATE_INCOMPLETE; + ele->ip4_addr = ip4_addr; + memset( ele->mac_addr, 0, 6UL ); + fd_neigh4_hmap_publish( query ); + + /* Trigger neighbor solicit via netlink */ + + int netlink_res = fd_neigh4_netlink_solicit( ctx->nl_req, if_idx, ip4_addr ); + if( FD_UNLIKELY( netlink_res<0 ) ) { + ctx->metrics.neigh_solicits_fails++; + return; + } + + ctx->metrics.neigh_solicits_sent++; + +} + +#define STEM_BURST (1UL) +#define STEM_LAZY ((ulong)97e6) /* 97ms */ + +#define STEM_CALLBACK_CONTEXT_TYPE fd_netlink_tile_ctx_t +#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_netlink_tile_ctx_t) + +#define STEM_CALLBACK_METRICS_WRITE metrics_write +#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping +#define STEM_CALLBACK_BEFORE_CREDIT before_credit +#define STEM_CALLBACK_AFTER_FRAG after_frag + +#include "../stem/fd_stem.c" + +/* End stem methods */ + +fd_topo_run_tile_t fd_tile_netlink = { + .name = "netlnk", + .populate_allowed_seccomp = populate_allowed_seccomp, + .populate_allowed_fds = populate_allowed_fds, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, + .run = stem_run +}; + +/* FIXME handle ENOBUFS */ diff --git a/src/disco/netlink/fd_netlink_tile.h b/src/disco/netlink/fd_netlink_tile.h new file mode 100644 index 0000000000..fe94bc5db1 --- /dev/null +++ b/src/disco/netlink/fd_netlink_tile.h @@ -0,0 +1,70 @@ 
+#ifndef HEADER_fd_src_disco_netlink_fd_netlink_tile_h
+#define HEADER_fd_src_disco_netlink_fd_netlink_tile_h
+
+/* fd_netlink_tile.h provides APIs for working with the netlink tile. */
+
+#include "../topo/fd_topo.h"
+
+/* fd_tile_netlink provides the netlink tile.
+
+   Consult /book/guide/netlink.md for more information.
+   Web mirror: https://docs.firedancer.io/guide/netlink.html */
+
+FD_PROTOTYPES_BEGIN
+extern fd_topo_run_tile_t fd_tile_netlink;
+FD_PROTOTYPES_END
+
+/* fd_netlink_neigh4_solicit_link_t holds information required to send
+   neighbor solicitation requests to the netlink tile.
+
+   mcache and depth are the arguments passed to fd_mcache_publish; seq
+   is the sequence number of the next request and is advanced by one
+   on every solicit call. */
+
+struct fd_netlink_neigh4_solicit_link {
+  fd_frag_meta_t * mcache;
+  ulong            depth;
+  ulong            seq;
+};
+
+typedef struct fd_netlink_neigh4_solicit_link fd_netlink_neigh4_solicit_link_t;
+
+struct fdctl_config;
+
+FD_PROTOTYPES_BEGIN
+
+void
+fd_netlink_topo_create( fd_topo_tile_t *            netlink_tile,
+                        fd_topo_t *                 topo,
+                        struct fdctl_config const * config );
+
+void
+fd_netlink_topo_join( fd_topo_t *      topo,
+                      fd_topo_tile_t * netlink_tile,
+                      fd_topo_tile_t * join_tile );
+
+/* fd_netlink_neigh4_solicit{,_sse} requests a neighbor solicitation (i.e.
+   ARP request) for an IPv4 address.  Safe to call at a high rate.  The
+   netlink tile will deduplicate requests.  ip4_addr is big endian.
+
+   The request is published as a payload-less frag whose sig field is
+   ip4_addr; tspub_comp is forwarded to the mcache publish call.
+
+   NOTE(review): only the address is placed in sig.  Bits 32..47, which
+   the netlink tile's after_frag parses as the interface index, are
+   therefore always zero.  Confirm the receiver accepts a zero
+   interface index, or extend these helpers to carry one.
*/
+
+static inline void
+fd_netlink_neigh4_solicit( fd_netlink_neigh4_solicit_link_t * link,
+                           uint                               ip4_addr,
+                           ulong                              tspub_comp ) {
+  ulong seq = link->seq;
+  ulong sig = ip4_addr;
+  fd_mcache_publish( link->mcache, link->depth, seq, sig, 0UL, 0UL, 0UL, 0UL, tspub_comp );
+  link->seq = fd_seq_inc( seq, 1UL );
+}
+
+#if FD_HAS_SSE
+static inline void
+fd_netlink_neigh4_solicit_sse( fd_netlink_neigh4_solicit_link_t * link,
+                               uint                               ip4_addr,
+                               ulong                              tspub_comp ) {
+  ulong seq = link->seq;
+  ulong sig = ip4_addr;
+  fd_mcache_publish_sse( link->mcache, link->depth, seq, sig, 0UL, 0UL, 0UL, 0UL, tspub_comp );
+  link->seq = fd_seq_inc( seq, 1UL );
+}
+#endif /* FD_HAS_SSE */
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_disco_netlink_fd_netlink_tile_h */
diff --git a/src/disco/netlink/fd_netlink_tile_private.h b/src/disco/netlink/fd_netlink_tile_private.h
new file mode 100644
index 0000000000..ebbe806314
--- /dev/null
+++ b/src/disco/netlink/fd_netlink_tile_private.h
@@ -0,0 +1,58 @@
+#ifndef HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h
+#define HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h
+
+#include "../../waltz/ip/fd_netlink1.h"
+#include "../metrics/generated/fd_metrics_netlnk.h"
+#include "../../waltz/ip/fd_fib4.h"
+#include "../../waltz/mib/fd_dbl_buf.h"
+#include "../../waltz/mib/fd_netdev_tbl.h"
+#include "../../waltz/neigh/fd_neigh4_map.h"
+
+/* FD_NETLINK_TILE_CTX_MAGIC uniquely identifies a fd_netlink_tile_ctx_t.
+   Change this whenever the fd_netlink_tile_ctx_t struct changes.
*/
+
+#define FD_NETLINK_TILE_CTX_MAGIC (0xec431bf97929c691UL) /* random */
+
+/* fd_netlink_tile_ctx_t is the netlink tile's private state. */
+
+struct fd_netlink_tile_ctx {
+  ulong magic; /* ==FD_NETLINK_TILE_CTX_MAGIC */
+
+  fd_netlink_t nl_monitor[1]; /* socket subscribed to link/neigh/route notifications */
+  fd_netlink_t nl_req[1];     /* socket used for request/reply (table dumps, solicits) */
+
+  /* Pending actions */
+  ulong action; /* bit set of FD_NET_TILE_ACTION_{...} */
+# define FD_NET_TILE_ACTION_ROUTE4_UPDATE (1UL<<0)
+# define FD_NET_TILE_ACTION_LINK_UPDATE   (1UL<<1)
+# define FD_NET_TILE_ACTION_NEIGH_UPDATE  (1UL<<2)
+
+  /* Rate limit link and route table changes (in ticks) */
+  long update_backoff;   /* minimum ticks between two full syncs of a table */
+  long route4_update_ts; /* earliest tick of the next route table sync */
+  long link_update_ts;   /* earliest tick of the next link table sync */
+
+  /* Link table */
+  void *               netdev_local;  /* local mutable table */
+  ulong                netdev_sz;     /* size of netdev table */
+  fd_netdev_tbl_join_t netdev_tbl[1]; /* join to local mutable table */
+  fd_dbl_buf_t *       netdev_buf;    /* global immutable copy */
+
+  /* Route tables */
+  fd_fib4_t * fib4_local;
+  fd_fib4_t * fib4_main;
+
+  /* Neighbor table */
+  fd_neigh4_hmap_t neigh4[1];
+  uint             neigh4_ifidx; /* only interface whose neighbors are tracked */
+
+  struct {
+    ulong link_full_syncs;
+    ulong route_full_syncs;
+    ulong update_cnt[ FD_METRICS_COUNTER_NETLNK_UPDATES_CNT ]; /* indexed by FD_METRICS_ENUM_NETLINK_MSG_V_{...}_IDX */
+    ulong neigh_solicits_sent;
+    ulong neigh_solicits_fails;
+  } metrics;
+};
+
+typedef struct fd_netlink_tile_ctx fd_netlink_tile_ctx_t;
+
+#endif /* HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h */
diff --git a/src/disco/netlink/generated/netlink_seccomp.h b/src/disco/netlink/generated/netlink_seccomp.h
new file mode 100644
index 0000000000..369efe7009
--- /dev/null
+++ b/src/disco/netlink/generated/netlink_seccomp.h
@@ -0,0 +1,102 @@
+/* THIS FILE WAS GENERATED BY generate_filters.py. DO NOT EDIT BY HAND!
*/ +#ifndef HEADER_fd_src_disco_netlink_generated_netlink_seccomp_h +#define HEADER_fd_src_disco_netlink_generated_netlink_seccomp_h + +#include "../../../../src/util/fd_util_base.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__i386__) +# define ARCH_NR AUDIT_ARCH_I386 +#elif defined(__x86_64__) +# define ARCH_NR AUDIT_ARCH_X86_64 +#elif defined(__aarch64__) +# define ARCH_NR AUDIT_ARCH_AARCH64 +#else +# error "Target architecture is unsupported by seccomp." +#endif +static const unsigned int sock_filter_policy_netlink_instr_cnt = 34; + +static void populate_sock_filter_policy_netlink( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int nl_mon_fd, unsigned int nl_req_fd) { + FD_TEST( out_cnt >= 34 ); + struct sock_filter filter[34] = { + /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 30 ), + /* loading syscall number in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ), + /* allow write based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_write, /* check_write */ 4, 0 ), + /* allow fsync based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_fsync, /* check_fsync */ 7, 0 ), + /* allow sendto based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 8, 0 ), + /* allow recvfrom based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvfrom, /* check_recvfrom */ 15, 0 ), + /* none of the syscalls matched */ + { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 24 }, +// check_write: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 23, /* lbl_1 */ 0 ), +// lbl_1: 
+ /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 21, /* RET_KILL_PROCESS */ 20 ), +// check_fsync: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 19, /* RET_KILL_PROCESS */ 18 ), +// check_sendto: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 16 ), +// lbl_2: + /* load syscall argument 3 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_3 */ 0, /* RET_KILL_PROCESS */ 14 ), +// lbl_3: + /* load syscall argument 4 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 12 ), +// lbl_4: + /* load syscall argument 5 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 11, /* RET_KILL_PROCESS */ 10 ), +// check_recvfrom: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_mon_fd, /* lbl_5 */ 2, /* lbl_6 */ 0 ), +// lbl_6: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 6 ), +// lbl_5: + /* load syscall argument 3 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), + BPF_JUMP( BPF_JMP | 
BPF_JEQ | BPF_K, 0, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 4 ), +// lbl_7: + /* load syscall argument 4 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 2 ), +// lbl_8: + /* load syscall argument 5 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), +// RET_KILL_PROCESS: + /* KILL_PROCESS is placed before ALLOW since it's the fallthrough case. */ + BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS ), +// RET_ALLOW: + /* ALLOW has to be reached by jumping */ + BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_ALLOW ), + }; + fd_memcpy( out, filter, sizeof( filter ) ); +} + +#endif diff --git a/src/disco/netlink/netlink.seccomppolicy b/src/disco/netlink/netlink.seccomppolicy new file mode 100644 index 0000000000..4269886c49 --- /dev/null +++ b/src/disco/netlink/netlink.seccomppolicy @@ -0,0 +1,37 @@ +# logfile_fd: It can be disabled by configuration, but typically tiles +# will open a log file on boot and write all messages there. +# +# nl_mon_fd: An rtnetlink socket used to monitor updates +# nl_req_fd: An rtnetlink socket used for request-reply +unsigned int logfile_fd, unsigned int nl_mon_fd, unsigned int nl_req_fd + +# logging: all log messages are written to a file and/or pipe +# +# 'WARNING' and above are written to the STDERR pipe, while all messages +# are always written to the log file. +# +# arg 0 is the file descriptor to write to. The boot process ensures +# that descriptor 2 is always STDERR and descriptor 4 is the logfile. +write: (or (eq (arg 0) 2) + (eq (arg 0) logfile_fd)) + +# logging: 'WARNING' and above fsync the logfile to disk immediately +# +# arg 0 is the file descriptor to fsync. The boot process ensures that +# descriptor 3 is always the logfile. 
+fsync: (eq (arg 0) logfile_fd) + +# sendto(2) is used to send netlink requests to the kernel +# (In theory could use send(2) but that syscall doesn't exist on arm64) +sendto: (and (eq (arg 0) nl_req_fd) + (eq (arg 3) 0) + (eq (arg 4) 0) + (eq (arg 5) 0)) + +# recvfrom(2) is used to receive netlink responses from the kernel +# (Using recvfrom(2) instead of recv(2) for same ABI reasons as above) +recvfrom: (and (or (eq (arg 0) nl_mon_fd) + (eq (arg 0) nl_req_fd)) + (eq (arg 3) 0) + (eq (arg 4) 0) + (eq (arg 5) 0)) diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 69977878c4..ae954d522d 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -141,12 +141,17 @@ typedef struct { ushort gossip_listen_port; ushort repair_intake_listen_port; ushort repair_serve_listen_port; - - /* multihoming support */ - ulong multihome_ip_addrs_cnt; - uint multihome_ip_addrs[FD_NET_MAX_SRC_ADDR]; } net; + struct { + ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */ + ulong fib4_main_obj_id; /* fib4 containing main route table */ + ulong fib4_local_obj_id; /* fib4 containing local route table */ + uint neigh_if_idx; /* neigh4 interface index */ + ulong neigh4_obj_id; /* neigh4 hash map header */ + ulong neigh4_ele_obj_id; /* neigh4 hash map slots */ + } netlink; + struct { uint out_depth; uint reasm_cnt; diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index e29ee8c7a3..00453ca964 100644 --- a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -322,6 +322,7 @@ fd_topob_auto_layout( fd_topo_t * topo ) { which should be floating. 
*/ char const * FLOATING[] = { + "netlnk", "metric", "cswtch", "bencho", diff --git a/src/waltz/ip/Local.mk b/src/waltz/ip/Local.mk index 5becc29df0..ef5c8d4e32 100644 --- a/src/waltz/ip/Local.mk +++ b/src/waltz/ip/Local.mk @@ -14,3 +14,13 @@ $(call run-unit-test,test_netlink) $(call run-unit-test,test_routing) endif endif + +$(call add-hdrs,fd_fib4.h) +$(call add-objs,fd_fib4,fd_waltz) +ifdef FD_HAS_LINUX +$(call add-objs,fd_netlink1 fd_fib4_netlink,fd_waltz) +$(call make-unit-test,test_fib4_netlink,test_fib4_netlink,fd_waltz fd_util) +$(call run-unit-test,test_fib4_netlink) +endif +$(call make-unit-test,test_fib4,test_fib4,fd_waltz fd_util) +$(call run-unit-test,test_fib4) diff --git a/src/waltz/ip/fd_fib4.c b/src/waltz/ip/fd_fib4.c new file mode 100644 index 0000000000..4e21c18631 --- /dev/null +++ b/src/waltz/ip/fd_fib4.c @@ -0,0 +1,318 @@ +#include "fd_fib4.h" +#include "fd_fib4_private.h" +#include "../../util/fd_util.h" + +/* FIXME this implementation is not completely robust against torn reads */ + +FD_FN_CONST ulong +fd_fib4_align( void ) { + return alignof(fd_fib4_t); +} + +FD_FN_CONST ulong +fd_fib4_footprint( ulong route_max ) { + if( route_max==0 || route_max>UINT_MAX ) return 0UL; + return FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, + alignof(fd_fib4_t), sizeof(fd_fib4_t) ), + alignof(fd_fib4_key_t), route_max*sizeof(fd_fib4_key_t) ), + alignof(fd_fib4_hop_t), route_max*sizeof(fd_fib4_hop_t) ), + alignof(fd_fib4_t) ); +} + +void * +fd_fib4_new( void * mem, + ulong route_max ) { + + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_fib4_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + if( FD_UNLIKELY( route_max==0 || route_max>UINT_MAX ) ) { + FD_LOG_WARNING(( "invalid route_max" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_fib4_t * fib4 = FD_SCRATCH_ALLOC_APPEND( l, 
alignof(fd_fib4_t), sizeof(fd_fib4_t) ); + fd_fib4_key_t * keys = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_fib4_key_t), route_max*sizeof(fd_fib4_key_t) ); + fd_fib4_hop_t * vals = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_fib4_hop_t), route_max*sizeof(fd_fib4_hop_t) ); + FD_SCRATCH_ALLOC_FINI( l, alignof(fd_fib4_t) ); + + fd_memset( fib4, 0, sizeof(fd_fib4_t) ); + fd_memset( keys, 0, route_max*sizeof(fd_fib4_key_t) ); + fd_memset( vals, 0, route_max*sizeof(fd_fib4_hop_t) ); + fib4->max = (uint)route_max; + fib4->hop_off = (ulong)vals - (ulong)fib4; + keys[0].prio = UINT_MAX; + + fd_fib4_clear( fib4 ); + + return fib4; +} + +fd_fib4_t * +fd_fib4_join( void * mem ) { + return (fd_fib4_t *)mem; +} + +void * +fd_fib4_leave( fd_fib4_t * fib4 ) { + return fib4; +} + +void * +fd_fib4_delete( void * mem ) { + return mem; +} + +void +fd_fib4_clear( fd_fib4_t * fib4 ) { + + /* Step 1: Make default route negative */ + + fd_fib4_hop_tbl( fib4 )->rtype = FD_FIB4_RTYPE_BLACKHOLE; + FD_COMPILER_MFENCE(); + + /* Step 2: Disable all other routes */ + + fib4->active_cnt = 1U; + FD_COMPILER_MFENCE(); + + /* Step 3: Indicate we are mid write */ + + fib4->generation++; + FD_COMPILER_MFENCE(); + + /* Step 4: Update metadata */ + + fib4->generation++; + fib4->prepare_cnt = 1U; +} + +FD_FN_PURE ulong +fd_fib4_max( fd_fib4_t const * fib ) { + return fib->max; +} + +FD_FN_PURE ulong +fd_fib4_cnt( fd_fib4_t const * fib ) { + return fib->prepare_cnt ? 
fib->prepare_cnt : fib->active_cnt; +} + +ulong +fd_fib4_free_cnt( fd_fib4_t const * fib ) { + if( FD_UNLIKELY( fib->prepare_cnt==0 ) ) return 0UL; + if( FD_UNLIKELY( fib->prepare_cnt > fib->max ) ) FD_LOG_CRIT(( "prepare_cnt > max" )); + return fib->max - fib->prepare_cnt; +} + +fd_fib4_hop_t * +fd_fib4_append( fd_fib4_t * fib, + uint ip4_dst, + int prefix, + uint prio ) { + if( FD_UNLIKELY( fib->prepare_cnt>=fib->max ) ) { + FD_LOG_WARNING(( "Failed to insert route, route table is full (%u max)", fib->max )); + return NULL; + } + if( FD_UNLIKELY( fib->prepare_cnt==0 ) ) { + FD_LOG_WARNING(( "Attempted to write to fib4 without lock" )); + return NULL; + } + + uint idx = fib->prepare_cnt; + fib->prepare_cnt = idx+1U; + + fd_fib4_key_t * key = fd_fib4_key_tbl( fib ) + idx; + *key = (fd_fib4_key_t) { + .addr = fd_uint_bswap( ip4_dst ), + .mask = prefix>0 ? fd_uint_mask( 32-prefix, 31 ) : 0U, + .prio = prio + }; + return fd_fib4_hop_tbl( fib ) + idx; +} + +void +fd_fib4_publish( fd_fib4_t * fib ) { + + /* Step 1: Enable new routes */ + + fib->active_cnt = fib->prepare_cnt; + FD_COMPILER_MFENCE(); + + /* Step 2: Make default route neutral */ + + fd_fib4_hop_tbl( fib )->rtype = FD_FIB4_RTYPE_THROW; + FD_COMPILER_MFENCE(); + + /* Step 3: Indicate that write is complete */ + + fib->generation++; + FD_COMPILER_MFENCE(); + + /* Step 4: Update metadata */ + + fib->prepare_cnt = 0U; +} + +fd_fib4_hop_t const * +fd_fib4_lookup( fd_fib4_t const * fib, + fd_fib4_hop_t * out, + uint ip4_dst, + ulong flags ) { + if( FD_UNLIKELY( flags ) ) { + return fd_fib4_hop_tbl_const( fib ) + 0; /* dead route */ + } + ip4_dst = fd_uint_bswap( ip4_dst ); + + ulong generation = fib->generation; + fd_fib4_key_t const * keys = fd_fib4_key_tbl_const( fib ); + FD_COMPILER_MFENCE(); + + ulong best_idx = 0UL; /* dead route */ + int best_mask = 32; /* least specific mask (/0) */ + for( ulong j=0UL; j<(fib->active_cnt); j++ ) { + /* FIXME consider branch variant? 
*/ + int match = (ip4_dst & keys[j].mask)==keys[j].addr; + int mask_bits = fd_uint_find_lsb_w_default( keys[j].mask, 32 ); + int more_specific = mask_bits< best_mask; + int less_costly = mask_bits==best_mask && keys[j].priogeneration!=generation ) ) { + return fd_fib4_hop_tbl_const( fib ) + 0; /* dead route */ + } + return out; +} + +#if FD_HAS_HOSTED + +#include +#include +#include "../../util/net/fd_ip4.h" + +#define WRAP_PRINT(file,str) if( FD_UNLIKELY( fputs( (str), (file) )<0 ) ) return errno +#define WRAP_PRINTF(file,...) if( FD_UNLIKELY( fprintf( (file), __VA_ARGS__ )<0 ) ) return errno + +static int +fd_fib4_fprintf_route( fd_fib4_key_t const * key, + fd_fib4_hop_t const * hop, + FILE * file ) { + + switch( hop->rtype ) { + case FD_FIB4_RTYPE_UNSPEC: + WRAP_PRINT( file, "unspecified " ); + break; + case FD_FIB4_RTYPE_UNICAST: + break; + case FD_FIB4_RTYPE_LOCAL: + WRAP_PRINT( file, "local " ); + break; + case FD_FIB4_RTYPE_BROADCAST: + WRAP_PRINT( file, "broadcast " ); + break; + case FD_FIB4_RTYPE_MULTICAST: + WRAP_PRINT( file, "multicast " ); + break; + case FD_FIB4_RTYPE_BLACKHOLE: + WRAP_PRINT( file, "blackhole " ); + break; + case FD_FIB4_RTYPE_THROW: + WRAP_PRINT( file, "throw " ); + break; + default: + WRAP_PRINTF( file, "invalid (%u) ", hop->rtype ); + break; + } + + if( key->mask==0 ) { + WRAP_PRINT( file, "default" ); + } else { + WRAP_PRINTF( file, FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( fd_uint_bswap( key->addr ) ) ); + if( key->mask!=UINT_MAX ) { + WRAP_PRINTF( file, "/%u", 32U-(uint)fd_uint_find_lsb_w_default( key->mask, 32 ) ); + } + } + + if( hop->ip4_gw ) { + WRAP_PRINTF( file, " via " FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( hop->ip4_gw ) ); + } + + if( hop->if_idx ) { + WRAP_PRINTF( file, " dev %u", hop->if_idx ); + } + + switch( hop->scope ) { + case 0: + break; + case 200: + WRAP_PRINT( file, " scope site" ); + break; + case 253: + WRAP_PRINT( file, " scope link" ); + break; + case 254: + WRAP_PRINT( file, " scope host" ); + break; + 
default: + WRAP_PRINTF( file, " scope %u", hop->scope ); + break; + } + + if( hop->ip4_src ) { + WRAP_PRINTF( file, " src " FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( hop->ip4_src ) ); + } + + if( key->prio ) { + WRAP_PRINTF( file, " metric %u", key->prio ); + } + + WRAP_PRINT( file, "\n" ); + + return 0; +} + +int +fd_fib4_fprintf( fd_fib4_t const * fib, + void * file_ ) { + FILE * file = file_; + fd_fib4_key_t const * key_tbl = fd_fib4_key_tbl_const( fib ); + fd_fib4_hop_t const * hop_tbl = fd_fib4_hop_tbl_const( fib ); + + FD_COMPILER_MFENCE(); + ulong active_cnt = fib->active_cnt; + ulong generation = fib->generation; + FD_COMPILER_MFENCE(); + + for( ulong j=0UL; jgeneration ); + FD_COMPILER_MFENCE(); + if( FD_UNLIKELY( cur_gen!=generation ) ) { + WRAP_PRINT( file, "=== TORN READ ===\n" ); + return 0; + } + fd_fib4_fprintf_route( &key, &hop, file ); + } + + return 0; +} + +#undef WRAP_PRINT +#undef WRAP_PRINTF + +#endif /* FD_HAS_HOSTED */ diff --git a/src/waltz/ip/fd_fib4.h b/src/waltz/ip/fd_fib4.h new file mode 100644 index 0000000000..e71a66a905 --- /dev/null +++ b/src/waltz/ip/fd_fib4.h @@ -0,0 +1,187 @@ +#ifndef HEADER_fd_src_waltz_ip_fd_fib4_h +#define HEADER_fd_src_waltz_ip_fd_fib4_h + +/* A fib4 stores IPv4 routes in a query-optimized data structure. + + fib4 does not scale well to large numbers of routes. Every route + lookup is O(n) where n is the number of routes in the FIB. + + fib4 only supports a minimal set of features required for end devices + to operate. Packet forwarding is not supported. + + fib4 supports multi-threaded operation in a x86-TSO like environment. + (many reader threads, one writer thread) Refer to each function for + thread safety. + + A fib4 has two states: PREPARE and ACTIVE. In ACTIVE state, FIB lookups + function as expected but writes are prohibited. In PREPARE state, any + FIB lookup returns 'FD_FIB4_RTYPE_BLACKHOLE' but writes are allowed. + + A fib4 always has a dummy route at index 0. 
In PREPARE state, this + route is a BLACKHOLE route (terminate routing and drops the packet), + otherwise it is a THROW route (continue routing with next table). + + FIXME: CONSIDER TRIE BASED DATA STRUCTURE + + Trivia: https://en.wikipedia.org/wiki/Forwarding_information_base */ + +#include "../../util/fd_util_base.h" + +#define FD_FIB4_ALIGN (16UL) + +/* FD_FIB4_RTYPE_{...} enumerate route types. + These match Linux RTN_UNICAST, etc. */ + +#define FD_FIB4_RTYPE_UNSPEC (0) /* invalid */ +#define FD_FIB4_RTYPE_UNICAST (1) /* "normal" path */ +#define FD_FIB4_RTYPE_LOCAL (2) /* address on local host */ +#define FD_FIB4_RTYPE_BROADCAST (3) /* reserved for future use */ +#define FD_FIB4_RTYPE_MULTICAST (5) /* reserved for future use */ +#define FD_FIB4_RTYPE_BLACKHOLE (6) /* drop packet */ +#define FD_FIB4_RTYPE_THROW (9) /* continue in next table */ + +/* fd_fib4_t is a local handle to a fib4 object. Use fd_fib4_{align, + footprint,new,delete,join,leave} to construct and join a fib4. */ + +struct fd_fib4; +typedef struct fd_fib4 fd_fib4_t; + +/* fd_fib4_hop_t holds a FIB lookup result (see fd_fib4_lookup) */ + +struct __attribute__((aligned(16))) fd_fib4_hop { + uint ip4_gw; /* gateway address (big endian) */ + uint if_idx; /* output interface index */ + uint ip4_src; /* override source address (big endian). 0 implies unset */ + uchar rtype; /* route type (e.g. 
FD_FIB4_RTYPE_UNICAST) */ + uchar scope; /* used to select source address */ + uchar flags; /* app-specific flags (bitwise OR of FD_FIB4_FLAG_*) */ +}; + +#define FD_FIB4_FLAG_RTA_UNSUPPORTED ((uchar)0x01U) /* unsupported route attribute */ +#define FD_FIB4_FLAG_RTA_PARSE_ERR ((uchar)0x02U) /* failed to interpret route attribute */ +#define FD_FIB4_FLAG_RTYPE_UNSUPPORTED ((uchar)0x04U) /* unsupported route type (own bit; 0x03 would alias the two flags above) */ + +typedef struct fd_fib4_hop fd_fib4_hop_t; + +FD_PROTOTYPES_BEGIN + +/* Constructor APIs ******************************************************/ + +FD_FN_CONST ulong +fd_fib4_align( void ); + +FD_FN_CONST ulong +fd_fib4_footprint( ulong route_max ); + +void * +fd_fib4_new( void * mem, + ulong route_max ); + +fd_fib4_t * +fd_fib4_join( void * mem ); + +void * +fd_fib4_leave( fd_fib4_t * fib4 ); + +void * +fd_fib4_delete( void * mem ); + +/* Write APIs ************************************************************* + + Currently, any updates to a fib4 require a full rewrite (incremental + updates are not supported). During an update, fd_fib4_lookup calls + temporarily return a route entry with FD_FIB4_RTYPE_BLACKHOLE, which + means outgoing packets get dropped. (This is preferable to potentially + making an incorrect routing decision based on a partial route table.) + + Example usage: + + fd_fib4_clear() + ... multiple calls to fd_fib4_append() ... + fd_fib4_publish() */ + +/* fd_fib4_clear removes all route entries. Transitions the fib to PREPARE + state. */ + +void +fd_fib4_clear( fd_fib4_t * fib ); + +/* fd_fib4_max returns the max number of routes in the table. */ + +FD_FN_PURE ulong +fd_fib4_max( fd_fib4_t const * fib ); + +/* fd_fib4_cnt returns the number of routes in the table. In PREPARE state + returns the number of pending routes, in ACTIVE state returns the number + of active routes. */ + +FD_FN_PURE ulong +fd_fib4_cnt( fd_fib4_t const * fib ); + +/* fd_fib4_free_cnt returns the number of fd_fib4_append calls that are + guaranteed to succeed, if fib is in PREPARE state. 
If fib is in ACTIVE + state returns 0. */ + +FD_FN_PURE ulong +fd_fib4_free_cnt( fd_fib4_t const * fib ); + +/* fd_fib4_append attempts to add a new route entry. Assumes the fib is in + PREPARE state. If fd_fib4_free_cnt(fib) returned non-zero immediately + prior to calling append, then append is guaranteed to succeed. + + Returns a hop object to be filled by the caller on success. On failure, + returns NULL and logs warning. Reasons for failure include no space + left or fib not in PREPARE state. */ + +fd_fib4_hop_t * +fd_fib4_append( fd_fib4_t * fib, + uint ip4_dst, + int prefix, + uint prio ); + +/* fd_fib4_publish transitions the fib from PREPARE to ACTIVE state. If + the fib is already ACTIVE does nothing. */ + +void +fd_fib4_publish( fd_fib4_t * fib ); + +/* Read APIs */ + +/* fd_fib4_lookup resolves the next hop for an arbitrary IPv4 address. + If route was not found, retval->rtype is set to FD_FIB4_RTYPE_THROW. + If fib is not in ACTIVE state, retval->rtype is set to + FD_FIB4_RTYPE_BLACKHOLE. + + Thread safe; Gracefully handles concurrent route updates by other + threads. */ + +fd_fib4_hop_t const * +fd_fib4_lookup( fd_fib4_t const * fib, + fd_fib4_hop_t * out, + uint ip4_dst, + ulong flags ); + +/* fd_fib4_hop_or is a helper to chain together multiple FIB lookups. */ + +FD_FN_PURE static inline fd_fib4_hop_t const * +fd_fib4_hop_or( fd_fib4_hop_t const * left, + fd_fib4_hop_t const * right ) { + return left->rtype!=FD_FIB4_RTYPE_THROW ? left : right; +} + +#if FD_HAS_HOSTED + +/* fd_fib4_fprintf prints the routing table to the given FILE * pointer (or + target equivalent). Order of routes is undefined but guaranteed to be + stable between calls. Outputs ASCII encoding with LF newlines. Returns + errno on failure and 0 on success. Only works on ACTIVE tables. 
*/ + +int +fd_fib4_fprintf( fd_fib4_t const * fib, + void * file ); + +#endif + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_waltz_ip_fd_fib4_h */ diff --git a/src/waltz/ip/fd_fib4_netlink.c b/src/waltz/ip/fd_fib4_netlink.c new file mode 100644 index 0000000000..40202191b5 --- /dev/null +++ b/src/waltz/ip/fd_fib4_netlink.c @@ -0,0 +1,264 @@ +#include "fd_fib4_netlink.h" +#include "fd_fib4.h" +#include "fd_netlink.h" + +#if !defined(__linux__) +#error "fd_fib4_netlink.c requires a Linux system with kernel headers" +#endif + +#include +#include +#include +#include +#include "../../util/fd_util.h" + +FD_STATIC_ASSERT( FD_FIB4_RTYPE_UNSPEC ==RTN_UNSPEC, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_UNICAST ==RTN_UNICAST, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_LOCAL ==RTN_LOCAL, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_BROADCAST==RTN_BROADCAST, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_MULTICAST==RTN_MULTICAST, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_BLACKHOLE==RTN_BLACKHOLE, linux ); +FD_STATIC_ASSERT( FD_FIB4_RTYPE_THROW ==RTN_THROW, linux ); + +static void +fd_fib4_rta_gateway( fd_fib4_hop_t * hop, + void const * rta, + ulong rta_sz ) { + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_GATEWAY", rta, rta_sz )); + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + return; + } + uint ip_addr = FD_LOAD( uint, rta ); /* big endian */ + hop->ip4_gw = ip_addr; +} + +static void +fd_fib4_rta_oif( fd_fib4_hop_t * hop, + void const * rta, + ulong rta_sz ) { + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_OIF", rta, rta_sz )); + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + return; + } + hop->if_idx = FD_LOAD( uint, rta ); /* host byte order */ +} + +static void +fd_fib4_rta_prefsrc( fd_fib4_hop_t * hop, + void const * rta, + ulong rta_sz ) { + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_PREFSRC", rta, rta_sz )); + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + return; + } + 
hop->ip4_src = FD_LOAD( uint, rta ); /* big endian */ +} + +static int +fd_fib4_netlink_translate( fd_fib4_t * fib, + struct nlmsghdr const * msg_hdr, + uint table_id ) { + uint ip4_dst = 0U; + int prefix = -1; /* -1 indicates unset ip4_dst / prefix */ + uint prio = 0U; /* default metric */ + + fd_fib4_hop_t hop[1] = {0}; + + struct rtmsg * msg = NLMSG_DATA( msg_hdr ); + struct rtattr * rat = RTM_RTA( msg ); + long rat_sz = (long)(int)RTM_PAYLOAD( msg_hdr ); + + switch( msg->rtm_type ) { + case RTN_UNICAST: + hop->rtype = FD_FIB4_RTYPE_UNICAST; + break; + case RTN_LOCAL: + hop->rtype = FD_FIB4_RTYPE_LOCAL; + break; + case RTN_BROADCAST: + hop->rtype = FD_FIB4_RTYPE_BROADCAST; + break; + case RTN_MULTICAST: + hop->rtype = FD_FIB4_RTYPE_MULTICAST; + break; + case RTN_BLACKHOLE: + hop->rtype = FD_FIB4_RTYPE_BLACKHOLE; + break; + default: + FD_LOG_DEBUG(( "Unsupported route type (%u-%s)", msg->rtm_type, fd_netlink_rtm_type_str( msg->rtm_type ) )); + hop->rtype = FD_FIB4_RTYPE_BLACKHOLE; + hop->flags |= FD_FIB4_FLAG_RTYPE_UNSUPPORTED; + break; + } + + for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) { + void * rta = RTA_DATA( rat ); + ulong rta_sz = RTA_PAYLOAD( rat ); + + switch( rat->rta_type ) { + + case RTA_GATEWAY: + fd_fib4_rta_gateway( hop, rta, rta_sz ); + break; + + case RTA_DST: + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + continue; + } + ip4_dst = FD_LOAD( uint, rta ); /* big endian */ + prefix = msg->rtm_dst_len; + break; + + case RTA_OIF: + fd_fib4_rta_oif( hop, rta, rta_sz ); + break; + + case RTA_PREFSRC: + fd_fib4_rta_prefsrc( hop, rta, rta_sz ); + break; + + case RTA_PRIORITY: + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + continue; + } + prio = FD_LOAD( uint, rta ); /* host byte order */ + break; + + case RTA_TABLE: + /* Skip routes that aren't in the requested table */ + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR; + continue; + } + 
if( FD_LOAD( uint, rta )!=table_id ) return 0; + break; + + default: + FD_LOG_DEBUG(( "Unsupported route table attribute (%u-%s)", rat->rta_type, fd_netlink_rtattr_str( rat->rta_type ) )); + hop->flags |= FD_FIB4_FLAG_RTA_UNSUPPORTED; + break; + } + } + + if( fd_fib4_free_cnt( fib )==0UL ) return ENOSPC; + *fd_fib4_append( fib, ip4_dst, prefix, prio ) = *hop; + + return 0; +} + +int +fd_fib4_netlink_load_table( fd_fib4_t * fib, + fd_netlink_t * netlink, + uint table_id ) { + + uint seq = netlink->seq++; + + struct { + struct nlmsghdr nlh; /* Netlink header */ + struct rtmsg rtm; /* Payload - route message */ + struct rtattr rta; + uint table_id; + } request; + request.nlh = (struct nlmsghdr) { + .nlmsg_type = RTM_GETROUTE, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nlmsg_len = sizeof(request), + .nlmsg_seq = seq + }; + request.rtm = (struct rtmsg) { + .rtm_family = AF_INET, /* IPv4 */ + }; + request.rta = (struct rtattr) { + .rta_type = RTA_TABLE, + .rta_len = RTA_LENGTH( sizeof(uint) ) + }; + request.table_id = table_id; + + long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 ); + if( FD_UNLIKELY( send_res<0 ) ) { + FD_LOG_WARNING(( "netlink send(%d,RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP) failed (%d-%s)", netlink->fd, errno, fd_io_strerror( errno ) )); + return FD_FIB_NETLINK_ERR_IO; /* return a documented FD_FIB_NETLINK_ERR_* code, not a raw errno (which logging may also have clobbered) */ + } + if( FD_UNLIKELY( send_res!=sizeof(request) ) ) { + FD_LOG_WARNING(( "netlink send(%d,RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)", netlink->fd )); + return FD_FIB_NETLINK_ERR_IO; /* short write is an I/O failure; EPIPE is not in the FD_FIB_NETLINK_ERR_* code space */ + } + + fd_fib4_clear( fib ); /* enter PREPARE state: lookups blackhole until fd_fib4_publish */ + + int dump_intr = 0; + int no_space = 0; + ulong route_cnt = 0UL; + + uchar buf[ 4096 ]; + fd_netlink_iter_t iter[1]; + for( fd_netlink_iter_init( iter, netlink, buf, sizeof(buf) ); + !fd_netlink_iter_done( iter ); + fd_netlink_iter_next( iter, netlink ) ) { + struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter ); + if( FD_UNLIKELY( nlh->nlmsg_flags & NLM_F_DUMP_INTR ) ) dump_intr = 1; + if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) { + struct nlmsgerr 
 * err = NLMSG_DATA( nlh ); + int nl_err = -err->error; + FD_LOG_WARNING(( "netlink RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) )); + return FD_FIB_NETLINK_ERR_IO; /* kernel rejected the dump; the errno value is logged above, callers get a documented code */ + } + if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWROUTE ) ) { + FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type )); + continue; + } + route_cnt++; + + int translate_err = fd_fib4_netlink_translate( fib, nlh, table_id ); + if( FD_UNLIKELY( translate_err==ENOSPC ) ) { + no_space = 1; + break; + } + } + if( FD_UNLIKELY( iter->err > 0 ) ) return FD_FIB_NETLINK_ERR_IO; + ulong drain_cnt = fd_netlink_iter_drain( iter, netlink ); + + if( no_space ) { + FD_LOG_WARNING(( "Routing table is too small! `ip route show table %u` returned %lu entries, which exceeds the configured maximum of %lu", + table_id, route_cnt+drain_cnt, fd_fib4_max( fib ) )); + fd_fib4_clear( fib ); + return FD_FIB_NETLINK_ERR_SPACE; + } + + if( dump_intr ) { + FD_LOG_DEBUG(( "received NLM_F_DUMP_INTR (our read of the routing table was overrun by a concurrent write)" )); + return FD_FIB_NETLINK_ERR_INTR; + } + + if( FD_UNLIKELY( drain_cnt ) ) { + FD_LOG_WARNING(( "Unexpectedly skipped %lu routes. This is a bug!", drain_cnt )); + return FD_FIB_NETLINK_ERR_OOPS; + } + + fd_fib4_publish( fib ); + + return 0; +} + +FD_FN_CONST char const * +fd_fib4_netlink_strerror( int err ) { + switch( err ) { + case FD_FIB_NETLINK_SUCCESS: + return "success"; + case FD_FIB_NETLINK_ERR_OOPS: + return "oops"; + case FD_FIB_NETLINK_ERR_IO: + return "io"; + case FD_FIB_NETLINK_ERR_INTR: + return "interrupt"; + case FD_FIB_NETLINK_ERR_SPACE: + return "out of space"; + default: + return "unknown"; + } +} diff --git a/src/waltz/ip/fd_fib4_netlink.h b/src/waltz/ip/fd_fib4_netlink.h new file mode 100644 index 0000000000..cf0e2f169d --- /dev/null +++ b/src/waltz/ip/fd_fib4_netlink.h @@ -0,0 +1,52 @@ +/* fd_fib4_netlink.h provides APIs for importing routes from Linux netlink. 
*/ + +#if defined(__linux__) + +#include "fd_fib4.h" +#include "fd_netlink1.h" + +/* FD_FIB_NETLINK_* gives error codes for netlink import operations. */ + +#define FD_FIB_NETLINK_SUCCESS (0) /* success */ +#define FD_FIB_NETLINK_ERR_OOPS (1) /* unexpected internal error */ +#define FD_FIB_NETLINK_ERR_IO (2) /* netlink I/O error */ +#define FD_FIB_NETLINK_ERR_INTR (3) /* netlink read was interrupted */ +#define FD_FIB_NETLINK_ERR_SPACE (4) /* fib is too small */ + +FD_PROTOTYPES_BEGIN + +/* fd_fib4_netlink_load_table mirrors a route table from netlink to fib. + The route table is requested via RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP. + table_id is in [0,2^31). table_id is typically RT_TABLE_LOCAL or + RT_TABLE_MAIN. These are 255 and 254 respectively on Linux. Assumes + netlink has a usable rtnetlink socket. fib is a writable join to a fib4 + object in PREPARE or ACTIVE state. Logs to debug level for diagnostics + and warning level in case of error. + + Returns FD_FIB_NETLINK_SUCCESS on success and leaves fib in ACTIVE + state and netlink ready for the next request. fib is not guaranteed to + mirror the route table precisely even on success. (May turn routes with + unsupported type or attribute into blackhole routes.) + + On failure, leaves fib in PREPARE state (which blackholes all packets). + Return values FD_FIB_NETLINK_ERR_{...} in case of error as follows: + + OOPS: Internal error (bug) occurred. + IO: Unrecoverable send/recv error or failed to parse MULTIPART msg. + INTR: Concurrent write overran read of the routing table. Try again. + SPACE: Routing table is too small to mirror the requested table. + + On return, the netlink socket is ready for the next request (even in + case of error) unless the error is FD_FIB_NETLINK_ERR_IO. 
*/ + +int +fd_fib4_netlink_load_table( fd_fib4_t * fib, + fd_netlink_t * netlink, + uint table_id ); + +FD_FN_CONST char const * +fd_fib4_netlink_strerror( int err ); + +FD_PROTOTYPES_END + +#endif /* defined(__linux__) */ diff --git a/src/waltz/ip/fd_fib4_private.h b/src/waltz/ip/fd_fib4_private.h new file mode 100644 index 0000000000..cdfdbd52bd --- /dev/null +++ b/src/waltz/ip/fd_fib4_private.h @@ -0,0 +1,40 @@ +#ifndef HEADER_fd_src_waltz_route_fd_fib4_private_h +#define HEADER_fd_src_waltz_route_fd_fib4_private_h + +#include "fd_fib4.h" + +struct __attribute__((aligned(FD_FIB4_ALIGN))) fd_fib4_key { + /* FIXME optimize this to 8 bytes? */ + uint addr; /* prefix bits, little endian (low bits outside of mask are undefined) */ + uint mask; /* bit pattern */ + uint prio; /* lower is higher */ +}; + +typedef struct fd_fib4_key fd_fib4_key_t; + +struct __attribute__((aligned(FD_FIB4_ALIGN))) fd_fib4 { + ulong generation; + uint prepare_cnt; /* >0 implies PREPARE state, ==0 implies ACTIVE */ + uint active_cnt; + uint max; + ulong hop_off; + /* fd_fib4_key_t[] follows */ + /* fd_fib4_hop_t[] follows */ +}; + +FD_FN_CONST ulong +fd_fib4_key_tbl_laddr( fd_fib4_t const * fib ) { + return (ulong)fib + sizeof(fd_fib4_t); +} + +FD_FN_PURE ulong +fd_fib4_hop_tbl_laddr( fd_fib4_t const * fib ) { + return (ulong)fib + fib->hop_off; +} + +FD_FN_CONST static inline fd_fib4_key_t const * fd_fib4_key_tbl_const( fd_fib4_t const * fib ) { return (fd_fib4_key_t const *)fd_fib4_key_tbl_laddr( fib ); } +FD_FN_CONST static inline fd_fib4_key_t * fd_fib4_key_tbl ( fd_fib4_t * fib ) { return (fd_fib4_key_t *) fd_fib4_key_tbl_laddr( fib ); } +FD_FN_CONST static inline fd_fib4_hop_t const * fd_fib4_hop_tbl_const( fd_fib4_t const * fib ) { return (fd_fib4_hop_t const *)fd_fib4_hop_tbl_laddr( fib ); } +FD_FN_CONST static inline fd_fib4_hop_t * fd_fib4_hop_tbl ( fd_fib4_t * fib ) { return (fd_fib4_hop_t *) fd_fib4_hop_tbl_laddr( fib ); } + +#endif /* HEADER_fd_src_waltz_route_fd_fib4_private_h 
*/ diff --git a/src/waltz/ip/fd_netlink1.c b/src/waltz/ip/fd_netlink1.c new file mode 100644 index 0000000000..6fc4871b80 --- /dev/null +++ b/src/waltz/ip/fd_netlink1.c @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include + +#include "fd_netlink1.h" +#include "../../util/fd_util.h" + +FD_TL ulong fd_netlink_enobufs_cnt; + +static int +fd_nl_create_socket( void ) { + int fd = socket( AF_NETLINK, SOCK_RAW, NETLINK_ROUTE ); + + if( FD_UNLIKELY( fd<0 ) ) { + FD_LOG_WARNING(( "socket(AF_NETLINK,SOCK_RAW,NETLINK_ROUTE) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return -1; + } + + int one = 1; + if( setsockopt( fd, SOL_NETLINK, NETLINK_EXT_ACK, &one, sizeof(one) )<0 ) { + FD_LOG_WARNING(( "setsockopt(sock,SOL_NETLINK,NETLINK_EXT_ACK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + close( fd ); + return -1; + } + + return fd; +} + +static void +fd_nl_close_socket( int fd ) { + if( fd >= 0 ) { + close( fd ); + } +} + +long +fd_netlink_read_socket( int fd, + uchar * buf, + ulong buf_sz ) { + /* netlink is datagram based + once a recv succeeds, any un-received bytes are lost + and the next datagram will be properly aligned in the buffer */ + for(;;) { + long len = recvfrom( fd, buf, buf_sz, 0, NULL, NULL ); + if( FD_UNLIKELY( len<=0L ) ) { + if( len==0L ) continue; + if( errno==EINTR ) continue; + if( errno==ENOBUFS ) { + fd_netlink_enobufs_cnt++; + continue; + } + FD_LOG_WARNING(( "netlink recv failed (%d-%s)", errno, fd_io_strerror( errno ) )); + return -(long)errno; + } + return len; + } +} + +fd_netlink_t * +fd_netlink_init( fd_netlink_t * nl, + uint seq0 ) { + nl->fd = fd_nl_create_socket(); + if( FD_UNLIKELY( nl->fd<0 ) ) return NULL; + nl->seq = seq0; + return nl; +} + +void * +fd_netlink_fini( fd_netlink_t * nl ) { + fd_nl_close_socket( nl->fd ); + nl->fd = -1; + return nl; +} + +static void +fd_netlink_iter_recvmsg( fd_netlink_iter_t * iter, + fd_netlink_t * netlink ) { + long len = fd_netlink_read_socket( netlink->fd, 
iter->buf, iter->buf_sz ); + if( len<0L ) { + iter->err = (int)-len; + return; + } + iter->msg0 = iter->buf; + iter->msg1 = iter->buf+len; +} + +/* fd_netlink_iter_bounds_check bounds checks the current message. If out- + of-bounds, logs warning and sets error EPROTO. This prevents the iterator + from returning an out-of-bounds netlink message. */ + +static void +fd_netlink_iter_bounds_check( fd_netlink_iter_t * iter ) { + if( fd_netlink_iter_done( iter ) ) return; + + struct nlmsghdr const * nlh = fd_type_pun_const( iter->msg0 ); + if( FD_UNLIKELY( iter->msg0 + sizeof(struct nlmsghdr) > iter->msg1 ) ) { + FD_LOG_WARNING(( "netlink message header out-of-bounds" )); + iter->err = EPROTO; + return; + } + if( FD_UNLIKELY( nlh->nlmsg_len < sizeof(struct nlmsghdr) ) ) { + /* prevent infinite loop */ + FD_LOG_WARNING(( "netlink message smaller than header" )); + iter->err = EPROTO; + return; + } + if( FD_UNLIKELY( iter->msg0 + nlh->nlmsg_len > iter->msg1 ) ) { + FD_LOG_WARNING(( "netlink message out-of-bounds: cur=[%p,%p) buf=[%p,%p)", + (void *)iter->msg0, (void *)iter->msg1, (void *)iter->buf, (void *)( iter->buf+iter->buf_sz ) )); + iter->err = EPROTO; + return; + } +} + +fd_netlink_iter_t * +fd_netlink_iter_init( fd_netlink_iter_t * iter, + fd_netlink_t * netlink, + uchar * buf, + ulong buf_sz ) { + *iter = (fd_netlink_iter_t) { + .buf = buf, + .buf_sz = buf_sz, + .msg0 = buf, + .msg1 = buf, + }; + + fd_netlink_iter_recvmsg( iter, netlink ); + fd_netlink_iter_bounds_check( iter ); + + return iter; +} + +int +fd_netlink_iter_done( fd_netlink_iter_t const * iter ) { + if( (iter->err!=0) | ( iter->msg1 - iter->msg0 < (long)sizeof(struct nlmsghdr) ) ) { + return 1; + } + struct nlmsghdr const * nlh = fd_type_pun_const( iter->msg0 ); + return nlh->nlmsg_type==NLMSG_DONE; +} + +fd_netlink_iter_t * +fd_netlink_iter_next( fd_netlink_iter_t * iter, + fd_netlink_t * netlink ) { + + if( fd_netlink_iter_done( iter ) ) return iter; + + struct nlmsghdr const * nlh = fd_type_pun_const( 
iter->msg0 ); + if( !(nlh->nlmsg_flags & NLM_F_MULTI) ) { + /* Last message was not a multipart message */ + iter->err = -1; /* eof */ + return iter; + } + iter->msg0 += NLMSG_ALIGN( nlh->nlmsg_len ); + + if( iter->msg0 >= iter->msg1 ) { + fd_netlink_iter_recvmsg( iter, netlink ); + } + fd_netlink_iter_bounds_check( iter ); + + return iter; +} + +char const * +fd_netlink_rtm_type_str( int rtm_type ) { + switch( rtm_type ) { + case RTN_UNSPEC: return "unspec"; + case RTN_UNICAST: return "unicast"; + case RTN_LOCAL: return "local"; + case RTN_BROADCAST: return "broadcast"; + case RTN_ANYCAST: return "anycast"; + case RTN_MULTICAST: return "multicast"; + case RTN_BLACKHOLE: return "blackhole"; + case RTN_UNREACHABLE: return "unreachable"; + case RTN_PROHIBIT: return "prohibit"; + case RTN_THROW: return "throw"; + case RTN_NAT: return "nat"; + case RTN_XRESOLVE: return "xresolve"; + default: return "unknown"; + } +} + +char const * +fd_netlink_rtattr_str( int rta_type ) { + switch( rta_type ) { + /* These exist since at least Linux v3.7 */ + case RTA_DST: return "dst"; + case RTA_SRC: return "src"; + case RTA_IIF: return "iif"; + case RTA_OIF: return "oif"; + case RTA_GATEWAY: return "gateway"; + case RTA_PRIORITY: return "priority"; + case RTA_PREFSRC: return "prefsrc"; + case RTA_METRICS: return "metrics"; + case RTA_MULTIPATH: return "multipath"; + case RTA_FLOW: return "flow"; + case RTA_CACHEINFO: return "cacheinfo"; + case RTA_TABLE: return "table"; + case RTA_MARK: return "mark"; +#ifdef RTA_MFC_STATS + case RTA_MFC_STATS: return "mfc_stats"; +#endif +#ifdef RTA_VIA + case RTA_VIA: return "via"; +#endif +#ifdef RTA_NEWDST + case RTA_NEWDST: return "newdst"; +#endif +#ifdef RTA_PREF + case RTA_PREF: return "pref"; +#endif +#ifdef RTA_ENCAP_TYPE + case RTA_ENCAP_TYPE: return "encap_type"; +#endif +#ifdef RTA_ENCAP + case RTA_ENCAP: return "encap"; +#endif +#ifdef RTA_EXPIRES + case RTA_EXPIRES: return "expires"; +#endif +#ifdef RTA_PAD + case RTA_PAD: return 
"pad"; +#endif +#ifdef RTA_UID + case RTA_UID: return "uid"; +#endif +#ifdef RTA_TTL_PROPAGATE + case RTA_TTL_PROPAGATE: return "ttl_propagate"; +#endif +#ifdef RTA_IP_PROTO + case RTA_IP_PROTO: return "ip_proto"; +#endif +#ifdef RTA_SPORT + case RTA_SPORT: return "sport"; +#endif +#ifdef RTA_DPORT + case RTA_DPORT: return "dport"; +#endif +#ifdef RTA_NH_ID + case RTA_NH_ID: return "nh_id"; +#endif + default: return "unknown"; + } +} diff --git a/src/waltz/ip/fd_netlink1.h b/src/waltz/ip/fd_netlink1.h new file mode 100644 index 0000000000..e43bad39c5 --- /dev/null +++ b/src/waltz/ip/fd_netlink1.h @@ -0,0 +1,107 @@ +#ifndef HEADER_fd_src_waltz_ip_fd_netlink_h +#define HEADER_fd_src_waltz_ip_fd_netlink_h + +#if defined(__linux__) + +#include "../../util/fd_util_base.h" + +struct fd_netlink { + int fd; /* netlink socket */ + uint seq; /* netlink sequence number */ +}; + +typedef struct fd_netlink fd_netlink_t; + +/* FIXME this should be a 'buffered reader' style API not an iterator since + iterators are infallible by definition in Firedancer style. */ + +struct fd_netlink_iter { + uchar * buf; + ulong buf_sz; + uchar * msg0; + uchar * msg1; + int err; +}; + +typedef struct fd_netlink_iter fd_netlink_iter_t; + +FD_PROTOTYPES_BEGIN + +/* fd_netlink_enobufs_cnt counts the number of ENOBUFS error occurrences. */ + +extern FD_TL ulong fd_netlink_enobufs_cnt; + +/* fd_netlink_init creates a new netlink session. Creates a new netlink + socket with explicit ACKs. seq0 is the initial sequence number. */ + +fd_netlink_t * +fd_netlink_init( fd_netlink_t * netlink, + uint seq0 ); + +/* fd_netlink_fini closes the netlink socket. */ + +void * +fd_netlink_fini( fd_netlink_t * netlink ); + +/* fd_netlink_read_socket wraps recvfrom(fd,buf,buf_sz,0,0,0) but + automatically skips EINTR and ENOBUFS errors. */ + +long +fd_netlink_read_socket( int fd, + uchar * buf, + ulong buf_sz ); + +/* fd_netlink_iter_init prepares iteration over a sequence of incoming + netlink multipart messages. 
*/ + +fd_netlink_iter_t * +fd_netlink_iter_init( fd_netlink_iter_t * iter, + fd_netlink_t * netlink, + uchar * buf, + ulong buf_sz ); + +/* fd_netlink_iter_done returns 0 if there are more netlink messages to + iterate over or 1 if not. */ + +int +fd_netlink_iter_done( fd_netlink_iter_t const * iter ); + +/* fd_netlink_iter_next advances the iterator to the next netlink message + (if any). Assumes !fd_netlink_iter_done(iter). Invalidates pointers + previously returned by fd_netlink_iter_msg(iter). */ + +fd_netlink_iter_t * +fd_netlink_iter_next( fd_netlink_iter_t * iter, + fd_netlink_t * netlink ); + +/* fd_netlink_iter_msg returns a pointer to the current netlink message + header. Assumes !fd_netlink_iter_done(iter). */ + +static inline struct nlmsghdr const * +fd_netlink_iter_msg( fd_netlink_iter_t const * iter ) { + return fd_type_pun_const( iter->msg0 ); +} + +static FD_FN_UNUSED ulong +fd_netlink_iter_drain( fd_netlink_iter_t * iter, + fd_netlink_t * netlink ) { + ulong cnt; + for( cnt=0UL; !fd_netlink_iter_done( iter ); cnt++ ) { + fd_netlink_iter_next( iter, netlink ); + } + return cnt; +} + +/* Debug utils */ + +char const * +fd_netlink_rtm_type_str( int rtm_type ); + +char const * +fd_netlink_rtattr_str( int rta_type ); + +FD_PROTOTYPES_END + +#endif /* defined(__linux__) */ + +#endif /* HEADER_fd_src_waltz_ip_fd_netlink_h */ diff --git a/src/waltz/ip/test_fib4.c b/src/waltz/ip/test_fib4.c new file mode 100644 index 0000000000..a4cb776a34 --- /dev/null +++ b/src/waltz/ip/test_fib4.c @@ -0,0 +1,150 @@ +#define _POSIX_C_SOURCE 200809L /* fmemopen */ +#include "fd_fib4.h" +#include "../../util/fd_util.h" +#include "../../util/net/fd_ip4.h" + +static uchar __attribute__((aligned(FD_FIB4_ALIGN))) +fib1_mem[ 4096 ]; + +static uchar __attribute__((aligned(FD_FIB4_ALIGN))) +fib2_mem[ 4096 ]; + +#if FD_HAS_HOSTED +#include + +static void +test_fib_print( fd_fib4_t const * fib, + char const * actual ) { + static char dump_buf[ 8192 ]; + FILE * dump = fmemopen( 
dump_buf, sizeof(dump_buf), "w" ); + FD_TEST( 0==fd_fib4_fprintf( fib, dump ) ); + ulong sz = (ulong)ftell( dump ); + fclose( dump ); + + if( FD_UNLIKELY( 0!=strncmp( dump_buf, actual, sz ) ) ) { + fwrite( dump_buf, 1, sz, stderr ); + fflush( stderr ); + FD_LOG_ERR(( "FAIL: fd_fib4_fprintf(fib) != expected" )); + } +} + +#else /* !FD_HAS_HOSTED */ + +#define test_fib_print(...) + +#endif + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + FD_TEST( fd_ulong_is_aligned( (ulong)fib1_mem, fd_fib4_align() ) ); + FD_TEST( fd_fib4_footprint( 16 )<=sizeof(fib1_mem) ); + fd_fib4_t * fib_local = fd_fib4_join( fd_fib4_new( fib1_mem, 16 ) ); + fd_fib4_t * fib_main = fd_fib4_join( fd_fib4_new( fib2_mem, 16 ) ); + fd_fib4_hop_t candidate[2]; + + /* Ensure FIB in PREPARE state returns BLACKHOLE */ + + FD_TEST( fd_fib4_lookup( fib_local, candidate, 0x12345678, 0 )->rtype==FD_FIB4_RTYPE_BLACKHOLE ); + + /* Ensure empty FIB in ACTIVE returns THROW */ + + fd_fib4_publish( fib_local ); + FD_TEST( fd_fib4_lookup( fib_local, candidate, 0x12345678, 0 )->rtype==FD_FIB4_RTYPE_THROW ); + + /* Simple production scenario + + # ip route list table local + broadcast 192.0.2.160 dev bond0 proto kernel scope link src 192.0.2.165 + local 192.0.2.165 dev bond0 proto kernel scope host src 192.0.2.165 + broadcast 192.0.2.191 dev bond0 proto kernel scope link src 192.0.2.165 + broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1 + local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1 + local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1 + broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1 + + # ip route list table main + default via 192.0.2.161 dev bond0 proto dhcp src 192.0.2.165 metric 300 + 192.0.2.160/27 dev bond0 proto kernel scope link src 192.0.2.165 metric 300 */ + + fd_fib4_clear( fib_local ); + FD_TEST( fd_fib4_free_cnt( fib_local )>=7 ); + *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,160 ), 32, 0 ) = 
(fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,165 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=6, .scope=254, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,191 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,0 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=1, .scope=253, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,0 ), 8, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=1, .scope=254, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,1 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=1, .scope=254, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) }; + *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,255,255 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=1, .scope=253, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) }; + fd_fib4_publish( fib_local ); + + test_fib_print( fib_local, + "throw default metric 4294967295\n" + "broadcast 192.0.2.160 dev 6 scope link src 192.0.2.165\n" + "local 192.0.2.165 dev 6 scope host src 192.0.2.165\n" + "broadcast 192.0.2.191 dev 6 scope link src 192.0.2.165\n" + "broadcast 127.0.0.0 dev 1 scope link src 127.0.0.1\n" + "local 127.0.0.0/8 dev 1 scope host src 127.0.0.1\n" + "local 127.0.0.1 dev 1 scope host src 127.0.0.1\n" + "broadcast 127.0.255.255 dev 1 scope link src 127.0.0.1\n" ); + + fd_fib4_clear( fib_main ); + FD_TEST( fd_fib4_free_cnt( fib_main )>=2 ); + *fd_fib4_append( fib_main, FD_IP4_ADDR( 0,0,0,0 ), 0, 300 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_UNICAST, .ip4_gw=FD_IP4_ADDR( 192,0,2,161 ), .if_idx=6, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) }; + *fd_fib4_append( fib_main, FD_IP4_ADDR( 192,0,2,161 ), 
27, 300 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_UNICAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) }; + fd_fib4_publish( fib_main ); + + test_fib_print( fib_main, + "throw default metric 4294967295\n" + "default via 192.0.2.161 dev 6 src 192.0.2.165 metric 300\n" + "192.0.2.161/27 dev 6 scope link src 192.0.2.165 metric 300\n" ); + +# define QUERY(ip) fd_fib4_hop_or( fd_fib4_lookup( fib_local, candidate+0, FD_IP4_ADDR ip, 0 ), fd_fib4_lookup( fib_main, candidate+1, FD_IP4_ADDR ip, 0 ) ) + fd_fib4_hop_t const * next; + + /* $ ip route get 127.0.0.1 + local 127.0.0.1 dev lo src 127.0.0.1 */ + next = QUERY(( 127,0,0,1 )); + FD_TEST( next->rtype==FD_FIB4_RTYPE_LOCAL ); + FD_TEST( next->if_idx==1 ); + FD_TEST( next->ip4_src==FD_IP4_ADDR( 127,0,0,1 ) ); + + /* $ ip route get 192.0.2.160 + broadcast 192.0.2.160 dev bond0 src 192.0.2.165 */ + next = QUERY(( 192,0,2,160 )); + FD_TEST( next->rtype==FD_FIB4_RTYPE_BROADCAST ); + FD_TEST( next->if_idx==6 ); + FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) ); + + /* $ ip route get 192.0.2.161 + 192.0.2.161 dev bond0 src 192.0.2.165 */ + next = QUERY(( 192,0,2,161 )); + FD_TEST( next->rtype==FD_FIB4_RTYPE_UNICAST ); + FD_TEST( next->if_idx==6 ); + FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) ); + + /* $ ip route get 192.0.2.191 + broadcast 192.0.2.191 dev bond0 src 192.0.2.165 */ + next = QUERY(( 192,0,2,191 )); + FD_TEST( next->rtype==FD_FIB4_RTYPE_BROADCAST ); + FD_TEST( next->if_idx==6 ); + FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) ); + + /* $ ip route get 8.8.8.8 + 8.8.8.8 via 192.0.2.161 dev bond0 src 192.0.2.165 */ + next = QUERY(( 8,8,8,8 )); + FD_TEST( next->rtype==FD_FIB4_RTYPE_UNICAST ); + FD_TEST( next->ip4_gw==FD_IP4_ADDR( 192,0,2,161 ) ); + FD_TEST( next->if_idx==6 ); + FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) ); + +# undef QUERY + + fd_fib4_delete( fd_fib4_leave( fib_local ) ); + fd_fib4_delete( fd_fib4_leave( fib_main ) ); + + fd_halt(); + return 0; +} diff --git 
a/src/waltz/ip/test_fib4_netlink.c b/src/waltz/ip/test_fib4_netlink.c new file mode 100644 index 0000000000..91b3aa44d5 --- /dev/null +++ b/src/waltz/ip/test_fib4_netlink.c @@ -0,0 +1,53 @@ +#include <stdio.h> +#include <linux/rtnetlink.h> /* RT_TABLE_MAIN */ +#include "fd_fib4_netlink.h" +#include "../../util/fd_util.h" + +#define DEFAULT_FIB_SZ (1<<20) /* 1 MiB */ + +static uchar __attribute__((aligned(FD_FIB4_ALIGN))) +fib1_mem[ DEFAULT_FIB_SZ ]; + +/* Load the given route table via netlink and dump it to stderr */ + +void +dump_table( fd_netlink_t * netlink, + uint table ) { + ulong const route_max = 256UL; + FD_TEST( fd_fib4_footprint( route_max )<=sizeof(fib1_mem) ); + fd_fib4_t * fib = fd_fib4_join( fd_fib4_new( fib1_mem, route_max ) ); + + int load_err = fd_fib4_netlink_load_table( fib, netlink, table ); + if( FD_UNLIKELY( load_err ) ) { + FD_LOG_WARNING(( "Failed to load table %u (%i-%s)", table, load_err, fd_fib4_netlink_strerror( load_err ) )); + return; + } + + fprintf( stderr, "# ip route show table %u\n", table ); + fd_log_flush(); + fd_fib4_fprintf( fib, stderr ); + fputs( "\n", stderr ); + + fd_fib4_delete( fd_fib4_leave( fib ) ); +} + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + fd_netlink_t _netlink[1]; + fd_netlink_t * netlink = fd_netlink_init( _netlink, 42U ); + FD_TEST( netlink ); + + FD_LOG_NOTICE(( "Dumping local and main routing tables to stderr\n" )); + fd_log_flush(); + dump_table( netlink, RT_TABLE_LOCAL ); + dump_table( netlink, RT_TABLE_MAIN ); + fflush( stderr ); + + fd_netlink_fini( netlink ); + + fd_halt(); + return 0; +} diff --git a/src/waltz/mib/Local.mk b/src/waltz/mib/Local.mk new file mode 100644 index 0000000000..e8f1215d93 --- /dev/null +++ b/src/waltz/mib/Local.mk @@ -0,0 +1,9 @@ +$(call add-hdrs,fd_dbl_buf.h) +$(call add-objs,fd_dbl_buf,fd_waltz) +$(call add-hdrs,fd_netdev_tbl.h) +$(call add-objs,fd_netdev_tbl,fd_waltz) +ifdef FD_HAS_LINUX +$(call add-hdrs,fd_netdev_netlink.h) +$(call add-objs,fd_netdev_netlink,fd_waltz) +$(call 
make-unit-test,test_netdev_netlink,test_netdev_netlink,fd_waltz fd_util) +endif diff --git a/src/waltz/mib/fd_dbl_buf.c b/src/waltz/mib/fd_dbl_buf.c new file mode 100644 index 0000000000..36191ea230 --- /dev/null +++ b/src/waltz/mib/fd_dbl_buf.c @@ -0,0 +1,138 @@ +#include "fd_dbl_buf.h" +#include "../../util/log/fd_log.h" +#include "../../tango/fd_tango_base.h" + +#if FD_HAS_SSE +#include "../../util/simd/fd_sse.h" +#endif + +ulong +fd_dbl_buf_align( void ) { + return FD_DBL_BUF_ALIGN; +} + +ulong +fd_dbl_buf_footprint( ulong mtu ) { + return FD_DBL_BUF_FOOTPRINT( mtu ); +} + +void * +fd_dbl_buf_new( void * shmem, + ulong mtu, + ulong seq0 ) { + + if( FD_UNLIKELY( !shmem ) ) { + FD_LOG_WARNING(( "NULL shmem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, FD_DBL_BUF_ALIGN ) ) ) { + FD_LOG_WARNING(( "misaligned shmem" )); + return NULL; + } + + ulong mtu_align = fd_ulong_align_up( mtu, FD_DBL_BUF_ALIGN ); + FD_SCRATCH_ALLOC_INIT( l, shmem ); + fd_dbl_buf_t * dbl_buf = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, sizeof(fd_dbl_buf_t) ); + void * buf0 = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, mtu_align ); + void * buf1 = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, mtu_align ); + FD_SCRATCH_ALLOC_FINI( l, FD_DBL_BUF_ALIGN ); + + *dbl_buf = (fd_dbl_buf_t) { + .magic = 0UL, + .seq = seq0, + .sz = 0UL, + .mtu = mtu, + .buf0 = (ulong)buf0 - (ulong)dbl_buf, + .buf1 = (ulong)buf1 - (ulong)dbl_buf + }; + + FD_COMPILER_MFENCE(); + FD_VOLATILE( dbl_buf->magic ) = FD_DBL_BUF_MAGIC; + FD_COMPILER_MFENCE(); + + return dbl_buf; +} + +fd_dbl_buf_t * +fd_dbl_buf_join( void * shbuf ) { + + if( FD_UNLIKELY( !shbuf ) ) { + FD_LOG_WARNING(( "NULL shbuf" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shbuf, FD_DBL_BUF_ALIGN ) ) ) { + FD_LOG_WARNING(( "misaligned shbuf" )); + return NULL; + } + + fd_dbl_buf_t * dbl_buf = shbuf; + if( FD_UNLIKELY( dbl_buf->magic!=FD_DBL_BUF_MAGIC ) ) { + FD_LOG_WARNING(( "bad magic" )); + 
return NULL; + } + + return dbl_buf; +} + +void * +fd_dbl_buf_leave( fd_dbl_buf_t * buf ) { + return buf; +} + +void * +fd_dbl_buf_delete( void * shbuf ) { + + if( FD_UNLIKELY( !shbuf ) ) { + FD_LOG_WARNING(( "NULL shbuf" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shbuf, FD_DBL_BUF_ALIGN ) ) ) { + FD_LOG_WARNING(( "misaligned shbuf" )); + return NULL; + } + + fd_dbl_buf_t * dbl_buf = shbuf; + FD_COMPILER_MFENCE(); + FD_VOLATILE( dbl_buf->magic ) = 0UL; + FD_COMPILER_MFENCE(); + return dbl_buf; +} + +void +fd_dbl_buf_insert( fd_dbl_buf_t * buf, + void const * msg, + ulong sz ) { + /* */ sz = fd_ulong_min( sz, buf->mtu ); + ulong seq = fd_seq_inc( buf->seq, 1UL ); + void * dst = fd_dbl_buf_slot( buf, seq ); + + fd_memcpy( dst, msg, sz ); + +# if FD_HAS_SSE + FD_COMPILER_MFENCE(); + vv_t seq_sz = vv( seq, sz ); + _mm_store_si128( &buf->seq_sz, seq_sz ); + FD_COMPILER_MFENCE(); +# else + buf->sz = sz; + FD_COMPILER_MFENCE(); + buf->seq = seq; + FD_COMPILER_MFENCE(); +# endif +} + +ulong +fd_dbl_buf_read( fd_dbl_buf_t * buf, + void * obj, + ulong * opt_seqp ) { + ulong _seq[1]; + ulong * seqp = opt_seqp ? opt_seqp : _seq; + ulong sz; + do { + sz = fd_dbl_buf_try_read( buf, obj, seqp ); + } while( FD_UNLIKELY( sz==ULONG_MAX ) ); + return sz; +} diff --git a/src/waltz/mib/fd_dbl_buf.h b/src/waltz/mib/fd_dbl_buf.h new file mode 100644 index 0000000000..fbe39d9be9 --- /dev/null +++ b/src/waltz/mib/fd_dbl_buf.h @@ -0,0 +1,165 @@ +#ifndef HEADER_fd_src_waltz_mib_fd_dbl_buf_h +#define HEADER_fd_src_waltz_mib_fd_dbl_buf_h + +/* fd_dbl_buf.h provides a concurrent lock-free double buffer. A double + buffer contains two buffers that take turns holding a message for + consumers and receving a new message by a producer. + + Supports a single producer thread and an arbitrary number of consumer + threads. Optimized for rare updates and frequent polling (e.g. config). + Use an fd_tango mcache/dcache pair if you need frequent updates. 
+ + Currently assumes a memory model that preserves store order across + threads (e.g. x86-TSO). Does not use atomics or hardware fences. */ + +#include "../../util/bits/fd_bits.h" +#if FD_HAS_SSE +#include <emmintrin.h> +#endif + +/* FIXME COULD ALLOW FOR IN-PLACE READS WITH PODs BY ADDING A MSG ALIGN ARGUMENT */ + +/* fd_dbl_buf_t is the header of a dbl_buf object. May not be locally + declared. */ + +union __attribute__((aligned(16UL))) fd_dbl_buf { + + struct { + ulong magic; /* ==FD_DBL_BUF_MAGIC */ + ulong mtu; + ulong buf0; /* offset to first buffer from beginning of struct */ + ulong buf1; /* — " — second — " — */ + ulong seq; /* latest msg seq no */ + ulong sz; /* latest msg size */ + ulong pad[2]; + /* objects follow here */ + }; + +# if FD_HAS_SSE + struct { + __m128i magic_mtu; + __m128i buf0_buf1; + __m128i seq_sz; + __m128i pad2; + }; +# endif + +}; + +typedef union fd_dbl_buf fd_dbl_buf_t; + +#define FD_DBL_BUF_MAGIC (0xa6c6f85d431c03ceUL) /* random */ + +#define FD_DBL_BUF_ALIGN (16UL) +#define FD_DBL_BUF_FOOTPRINT(mtu) \ + FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ + FD_DBL_BUF_ALIGN, sizeof(fd_dbl_buf_t) ), \ + FD_DBL_BUF_ALIGN, FD_ULONG_ALIGN_UP( mtu, FD_DBL_BUF_ALIGN )<<1UL ), \ + FD_DBL_BUF_ALIGN ) + +FD_PROTOTYPES_BEGIN + +/* fd_dbl_buf_{align,footprint} describe the memory region of a double + buffer. mtu is the largest possible message size. */ + +ulong +fd_dbl_buf_align( void ); + +ulong +fd_dbl_buf_footprint( ulong mtu ); + +/* fd_dbl_buf_new formats a memory region for use as a double buffer. + shmem points to the memory region matching fd_dbl_buf_{align,footprint}. + Initially, the active object of the double buffer will have sequence + number seq0 and zero byte size. 
*/ + +void * +fd_dbl_buf_new( void * shmem, + ulong mtu, + ulong seq0 ); + +fd_dbl_buf_t * +fd_dbl_buf_join( void * shbuf ); + +void * +fd_dbl_buf_leave( fd_dbl_buf_t * buf ); + +/* fd_dbl_buf_delete unformats the memory region backing a dbl_buf and + releases ownership back to the caller. Returns shbuf. */ + +void * +fd_dbl_buf_delete( void * shbuf ); + +/* fd_dbl_buf_obj_mtu returns the max message size a dbl_buf can store. */ + +static inline ulong +fd_dbl_buf_obj_mtu( fd_dbl_buf_t * buf ) { + return buf->mtu; +} + +/* fd_dbl_buf_seq_query peeks the current sequence number. */ + +static inline ulong +fd_dbl_buf_seq_query( fd_dbl_buf_t * buf ) { + FD_COMPILER_MFENCE(); + ulong seq = FD_VOLATILE_CONST( buf->seq ); + FD_COMPILER_MFENCE(); + return seq; +} + +/* fd_dbl_buf_slot returns a pointer to the buffer for the given sequence + number. buf0/buf1 are byte offsets from the start of buf, so the address is computed in bytes (not in units of sizeof(fd_dbl_buf_t)). */ + +FD_FN_PURE static inline void * +fd_dbl_buf_slot( fd_dbl_buf_t * buf, + ulong seq ) { + return (void *)( (ulong)buf + ( (seq&1) ? buf->buf1 : buf->buf0 ) ); +} + +/* fd_dbl_buf_insert appends a message to the double buffer. + + Note: It is NOT safe to call this function from multiple threads. */ + +void +fd_dbl_buf_insert( fd_dbl_buf_t * buf, + void const * msg, + ulong sz ); + +/* fd_dbl_buf_try_read does a speculative read of the most recent message + (from the caller's POV). The read may be overrun by a writer. out + points to a buffer of fd_dbl_buf_obj_mtu(buf) bytes. opt_seqp points to + a ulong or NULL. 
+ + On success: + - returns the size of the message read + - a copy of the message is stored at out + - *opt_seqp is set to the msg sequence number (if non-NULL) + + On failure (due to overrun): + - returns ULONG_MAX + - out buffer is clobbered + - *opt_seq is clobbered (if non-NULL) */ + +static inline ulong +fd_dbl_buf_try_read( fd_dbl_buf_t * buf, + void * out, + ulong * opt_seqp ) { + ulong seq = fd_dbl_buf_seq_query( buf ); + void * src = fd_dbl_buf_slot( buf, seq ); + ulong sz = FD_VOLATILE_CONST( buf->sz ); + fd_memcpy( out, src, sz ); + if( FD_UNLIKELY( seq!=fd_dbl_buf_seq_query( buf ) ) ) return ULONG_MAX; + fd_ulong_store_if( !!opt_seqp, opt_seqp, seq ); + return sz; +} + +/* fd_dbl_buf_read does a blocking */ + +ulong +fd_dbl_buf_read( fd_dbl_buf_t * buf, + void * obj, + ulong * opt_seqp ); + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_waltz_mib_fd_dbl_buf_h */ diff --git a/src/waltz/mib/fd_netdev_netlink.c b/src/waltz/mib/fd_netdev_netlink.c new file mode 100644 index 0000000000..99ce900d13 --- /dev/null +++ b/src/waltz/mib/fd_netdev_netlink.c @@ -0,0 +1,221 @@ +#include "fd_netdev_netlink.h" +#include "../../util/fd_util.h" +#include "fd_netdev_tbl.h" + +#if !defined(__linux__) +#error "fd_fib4_netlink.c requires a Linux system with kernel headers" +#endif + +#include +#include /* IFNAMSIZ */ +#include /* ARPHRD_NETROM */ +#include /* RTM_{...}, NLM_{...} */ + +static fd_netdev_t * +fd_netdev_init( fd_netdev_t * netdev ) { + *netdev = (fd_netdev_t) { + .mtu = 1500, + .if_idx = 0, + .slave_tbl_idx = -1, + .master_idx = -1, + .oper_status = FD_OPER_STATUS_INVALID + }; + return netdev; +} + +FD_FN_CONST static uchar +ifoper_to_oper_status( uint if_oper ) { + /* Linux uses different enum values than RFC 2863 */ + switch( if_oper ) { + case IF_OPER_UNKNOWN: + return FD_OPER_STATUS_UNKNOWN; + case IF_OPER_NOTPRESENT: + return FD_OPER_STATUS_NOT_PRESENT; + case IF_OPER_DOWN: + return FD_OPER_STATUS_DOWN; + case IF_OPER_LOWERLAYERDOWN: + return 
FD_OPER_STATUS_LOWER_LAYER_DOWN; + case IF_OPER_TESTING: + return FD_OPER_STATUS_TESTING; + case IF_OPER_DORMANT: + return FD_OPER_STATUS_DORMANT; + case IF_OPER_UP: + return FD_OPER_STATUS_UP; + default: + return FD_OPER_STATUS_INVALID; + } +} + +int +fd_netdev_netlink_load_table( fd_netdev_tbl_join_t * tbl, + fd_netlink_t * netlink ) { + + fd_netdev_tbl_reset( tbl ); + + uint seq = netlink->seq++; + + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifi; + } request; + request.nlh = (struct nlmsghdr) { + .nlmsg_type = RTM_GETLINK, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nlmsg_len = sizeof(request), + .nlmsg_seq = seq + }; + request.ifi = (struct ifinfomsg) { + .ifi_family = AF_PACKET, + .ifi_type = ARPHRD_NETROM, + }; + + long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 ); + if( FD_UNLIKELY( send_res<0 ) ) { + FD_LOG_WARNING(( "netlink send(%d,RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP) failed (%i-%s)", netlink->fd, errno, fd_io_strerror( errno ) )); + return errno; + } + if( FD_UNLIKELY( send_res!=sizeof(request) ) ) { + FD_LOG_WARNING(( "netlink send(%d,RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)", netlink->fd )); + return EPIPE; + } + + int err = 0; + + uchar buf[ 4096 ]; + fd_netlink_iter_t iter[1]; + for( fd_netlink_iter_init( iter, netlink, buf, sizeof(buf) ); + !fd_netlink_iter_done( iter ); + fd_netlink_iter_next( iter, netlink ) ) { + struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter ); + if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) { + struct nlmsgerr * err = NLMSG_DATA( nlh ); + int nl_err = -err->error; + FD_LOG_WARNING(( "netlink RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) )); + return nl_err; + } + if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWLINK ) ) { + FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type )); + continue; + } + struct ifinfomsg const * ifi = NLMSG_DATA( nlh ); + + if( FD_UNLIKELY( ifi->ifi_index<0 || 
ifi->ifi_index>=tbl->hdr->dev_max ) ) { + FD_LOG_WARNING(( "Error reading interface table: interface %d is beyond max of %u", ifi->ifi_index, tbl->hdr->dev_max )); + err = ENOSPC; + break; + } + if( ifi->ifi_type!=ARPHRD_ETHER && ifi->ifi_type!=ARPHRD_LOOPBACK ) continue; + + struct ifinfomsg * msg = NLMSG_DATA( nlh ); + struct rtattr * rat = (void *)( (ulong)msg + NLMSG_ALIGN( sizeof(struct ifinfomsg) ) ); + long rat_sz = (long)nlh->nlmsg_len - (long)NLMSG_ALIGN( sizeof(struct ifinfomsg) ); + + fd_netdev_t netdev[1]; + fd_netdev_init( netdev ); + + for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) { + void * rta = RTA_DATA( rat ); + ulong rta_sz = RTA_PAYLOAD( rat ); + + switch( rat->rta_type ) { + + case IFLA_IFNAME: + if( FD_UNLIKELY( rta_sz==0 || rta_sz>=IFNAMSIZ ) ) { + FD_LOG_WARNING(( "Error reading interface table: IFLA_IFNAME has unsupported size %lu", rta_sz )); + err = EPROTO; + goto fail; + } + memcpy( netdev->name, rta, rta_sz ); + netdev->name[ rta_sz ] = '\0'; + break; + + case IFLA_ADDRESS: + if( FD_UNLIKELY( rta_sz==6UL ) ) { + memcpy( netdev->mac_addr, rta, 6 ); + } + break; + + case IFLA_OPERSTATE: + if( FD_UNLIKELY( rta_sz!=1UL ) ) { + FD_LOG_WARNING(( "Error reading interface table: IFLA_OPERSTATE has unexpected size %lu", rta_sz )); + err = EPROTO; + goto fail; + } + netdev->oper_status = (uchar)ifoper_to_oper_status( FD_LOAD( uchar, rta ) ); + break; + + case IFLA_MTU: + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_WARNING(( "Error reading interface table: IFLA_MTU has unexpected size %lu", rta_sz )); + err = EPROTO; + goto fail; + } + netdev->mtu = (ushort)fd_uint_min( FD_LOAD( uint, rta ), USHORT_MAX ); + break; + + case IFLA_MASTER: { + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_WARNING(( "Error reading interface table: IFLA_MASTER has unexpected size %lu", rta_sz )); + err = EPROTO; + goto fail; + } + int master_idx = FD_LOAD( int, rta ); + if( FD_UNLIKELY( master_idx<0 || master_idx>=tbl->hdr->dev_max ) ) { + FD_LOG_WARNING(( 
"Error reading interface table: IFLA_MASTER has invalid index %d", master_idx )); + err = EPROTO; + goto fail; + } + netdev->master_idx = (short)master_idx; + break; + } + + } /* switch( rat->rta_type ) */ + } /* for each RTA */ + + if( ifi->ifi_type==ARPHRD_LOOPBACK ) { + netdev->oper_status = FD_OPER_STATUS_UP; + } + + tbl->dev_tbl[ ifi->ifi_index ] = *netdev; + tbl->hdr->dev_cnt = (ushort)fd_uint_max( tbl->hdr->dev_cnt, (uint)ifi->ifi_index+1U ); + } + + /* Walk the table again to index the bond master => slave mapping */ + + for( ulong j=0UL; j<(tbl->hdr->dev_cnt); j++ ) { + /* Only consider UP slaves */ + if( tbl->dev_tbl[ j ].oper_status!=FD_OPER_STATUS_UP ) continue; + + /* Find master */ + int master_idx = tbl->dev_tbl[ j ].master_idx; + if( master_idx<0 ) continue; + if( FD_UNLIKELY( master_idx>=tbl->hdr->dev_max ) ) continue; /* unreachable */ + fd_netdev_t * master = &tbl->dev_tbl[ master_idx ]; + + /* Allocate a new bond slave table if needed */ + if( master->slave_tbl_idx<0 ) { + if( FD_UNLIKELY( tbl->hdr->bond_cnt>=tbl->hdr->bond_max ) ) { + FD_LOG_WARNING(( "Error reading interface table: Found %u bond devices but max is %u", tbl->hdr->bond_cnt, tbl->hdr->bond_max )); + continue; + } + + master->slave_tbl_idx = (short)tbl->hdr->bond_cnt; + tbl->hdr->bond_cnt = (ushort)( tbl->hdr->bond_cnt+1U ); + /* Assume that this table is empty */ + } + + fd_netdev_bond_t * bond = &tbl->bond_tbl[ master->slave_tbl_idx ]; + if( FD_UNLIKELY( bond->slave_cnt>=FD_NETDEV_BOND_SLAVE_MAX ) ) { + FD_LOG_WARNING(( "Error reading interface table: Bond device %d has %u slaves but max is %d", master_idx, bond->slave_cnt, FD_NETDEV_BOND_SLAVE_MAX )); + continue; + } + bond->slave_idx[ bond->slave_cnt ] = (ushort)j; + bond->slave_cnt = (uchar)( bond->slave_cnt+1U ); + } + + return 0; + +fail: + fd_netlink_iter_drain( iter, netlink ); + return err; +} diff --git a/src/waltz/mib/fd_netdev_netlink.h b/src/waltz/mib/fd_netdev_netlink.h new file mode 100644 index 
0000000000..67b0f61759 --- /dev/null +++ b/src/waltz/mib/fd_netdev_netlink.h @@ -0,0 +1,17 @@ +/* fd_netdev_netlink.h provides APIs for importing network interfaces from + Linux netlink. */ + +#if defined(__linux__) + +#include "fd_netdev_tbl.h" +#include "../ip/fd_netlink1.h" + +FD_PROTOTYPES_BEGIN + +int +fd_netdev_netlink_load_table( fd_netdev_tbl_join_t * tbl, + fd_netlink_t * netlink ); + +FD_PROTOTYPES_END + +#endif /* defined(__linux__) */ diff --git a/src/waltz/mib/fd_netdev_tbl.c b/src/waltz/mib/fd_netdev_tbl.c new file mode 100644 index 0000000000..f8619c4ecf --- /dev/null +++ b/src/waltz/mib/fd_netdev_tbl.c @@ -0,0 +1,192 @@ +#include "fd_netdev_tbl.h" +#include "../../util/fd_util.h" + +struct fd_netdev_tbl_private { + ulong magic; + ulong dev_off; + ulong bond_off; + fd_netdev_tbl_hdr_t hdr; +}; + +FD_FN_CONST ulong +fd_netdev_tbl_align( void ) { + return FD_NETDEV_TBL_ALIGN; +} + +ulong +fd_netdev_tbl_footprint( ulong dev_max, + ulong bond_max ) { + if( FD_UNLIKELY( dev_max ==0UL || dev_max >USHORT_MAX ) ) return 0UL; + if( FD_UNLIKELY( bond_max==0UL || bond_max>USHORT_MAX ) ) return 0UL; + return FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ + alignof(fd_netdev_tbl_t), sizeof(fd_netdev_tbl_t) ), \ + alignof(fd_netdev_t), sizeof(fd_netdev_t) * dev_max ), \ + alignof(fd_netdev_bond_t), sizeof(fd_netdev_bond_t) * bond_max ), \ + FD_NETDEV_TBL_ALIGN ); +} + +void * +fd_netdev_tbl_new( void * shmem, + ulong dev_max, + ulong bond_max ) { + + if( FD_UNLIKELY( !shmem ) ) { + FD_LOG_WARNING(( "NULL shmem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, FD_NETDEV_TBL_ALIGN ) ) ) { + FD_LOG_WARNING(( "misaligned shmem" )); + return NULL; + } + + if( FD_UNLIKELY( !dev_max || dev_max>USHORT_MAX ) ) { + FD_LOG_WARNING(( "invalid dev_max" )); + return NULL; + } + + if( FD_UNLIKELY( !bond_max || bond_max>USHORT_MAX ) ) { + FD_LOG_WARNING(( "invalid bond_max" )); + return NULL; + } + + 
FD_SCRATCH_ALLOC_INIT( l, shmem ); + fd_netdev_tbl_t * tbl = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_tbl_t), sizeof(fd_netdev_tbl_t) ); + fd_netdev_t * dev = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_t), sizeof(fd_netdev_t) * dev_max ); + fd_netdev_bond_t * bond = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_bond_t), sizeof(fd_netdev_bond_t) * bond_max ); + FD_SCRATCH_ALLOC_FINI( l, FD_NETDEV_TBL_ALIGN ); + + *tbl = (fd_netdev_tbl_t) { + .magic = FD_NETDEV_TBL_MAGIC, + .dev_off = (ulong)dev - (ulong)tbl, + .bond_off = (ulong)bond - (ulong)tbl, + .hdr = { + .dev_max = (ushort)dev_max, + .bond_max = (ushort)bond_max, + .dev_cnt = 0, + .bond_cnt = 0, + } + }; + + fd_netdev_tbl_join_t join[1]; + fd_netdev_tbl_join( join, shmem ); + fd_netdev_tbl_reset( join ); + fd_netdev_tbl_leave( join ); + + return tbl; +} + +fd_netdev_tbl_join_t * +fd_netdev_tbl_join( void * ljoin, + void * shtbl ) { + + if( FD_UNLIKELY( !shtbl ) ) { + FD_LOG_WARNING(( "NULL shtbl" )); + return NULL; + } + + fd_netdev_tbl_join_t * join = ljoin; + fd_netdev_tbl_t * tbl = shtbl; + + if( FD_UNLIKELY( tbl->magic!=FD_NETDEV_TBL_MAGIC ) ) { + FD_LOG_WARNING(( "bad magic" )); + return NULL; + } + + *join = (fd_netdev_tbl_join_t) { + .hdr = &tbl->hdr, + .dev_tbl = (fd_netdev_t *)( (ulong)tbl + tbl->dev_off ), + .bond_tbl = (fd_netdev_bond_t *)( (ulong)tbl + tbl->bond_off ), + }; + + return join; +} + +void * +fd_netdev_tbl_leave( fd_netdev_tbl_join_t * join ) { + return join; +} + +void * +fd_netdev_tbl_delete( void * shtbl ) { + + if( FD_UNLIKELY( !shtbl ) ) { + FD_LOG_WARNING(( "NULL shtbl" )); + return NULL; + } + + fd_netdev_tbl_t * tbl = shtbl; + tbl->magic = 0UL; + return tbl; +} + +void +fd_netdev_tbl_reset( fd_netdev_tbl_join_t * tbl ) { + tbl->hdr->dev_cnt = 0; + tbl->hdr->bond_cnt = 0; + for( ulong j=0UL; j<(tbl->hdr->dev_max); j++ ) { + tbl->dev_tbl[j] = (fd_netdev_t) { + .master_idx = -1, + .slave_tbl_idx = -1 + }; + } + fd_memset( tbl->bond_tbl, 0, sizeof(fd_netdev_bond_t) * 
tbl->hdr->bond_max ); +} + +#if FD_HAS_HOSTED + +#include <errno.h> +#include <stdio.h> +#include "../../util/net/fd_eth.h" + +#define WRAP_PRINT(file,str) do { if( FD_UNLIKELY( fputs( (str), (file) )<0 ) ) return errno; } while(0) +#define WRAP_PRINTF(file,...) do { if( FD_UNLIKELY( fprintf( (file), __VA_ARGS__ )<0 ) ) return errno; } while(0) + +int +fd_netdev_tbl_fprintf( fd_netdev_tbl_join_t const * tbl, + void * file_ ) { + FILE * file = file_; + for( ulong j=0UL; j<(tbl->hdr->dev_cnt); j++ ) { + fd_netdev_t const * dev = &tbl->dev_tbl[j]; + if( !dev->oper_status ) continue; + WRAP_PRINTF( file, + "%lu: %s: mtu %u state (%i-%s)", + j, dev->name, dev->mtu, + dev->oper_status, fd_oper_status_cstr( dev->oper_status ) ); + if( dev->slave_tbl_idx>=0 ) { + WRAP_PRINT( file, " master" ); + } + WRAP_PRINTF( file, + "\n link " FD_ETH_MAC_FMT "\n", + FD_ETH_MAC_FMT_ARGS( dev->mac_addr ) ); + if( dev->slave_tbl_idx>=0 && tbl->bond_tbl[ dev->slave_tbl_idx ].slave_cnt ) { + fd_netdev_bond_t * bond = &tbl->bond_tbl[ dev->slave_tbl_idx ]; + WRAP_PRINTF( file, " slaves (%u):", bond->slave_cnt ); + for( ulong k=0UL; k<(bond->slave_cnt); k++ ) { + WRAP_PRINTF( file, " %u-%s", bond->slave_idx[k], tbl->dev_tbl[ bond->slave_idx[k] ].name ); + } + WRAP_PRINT( file, "\n" ); + } + } + return 0; +} + +#undef WRAP_PRINT +#undef WRAP_PRINTF + +#endif /* FD_HAS_HOSTED */ + +char const * +fd_oper_status_cstr( uint oper_status ) { + switch( oper_status ) { + case FD_OPER_STATUS_UP: return "up"; + case FD_OPER_STATUS_DOWN: return "down"; + case FD_OPER_STATUS_TESTING: return "testing"; + case FD_OPER_STATUS_DORMANT: return "dormant"; + case FD_OPER_STATUS_NOT_PRESENT: return "not present"; + case FD_OPER_STATUS_LOWER_LAYER_DOWN: return "lower layer down"; + case FD_OPER_STATUS_UNKNOWN: /* fallthrough */ + default: + return "unknown"; + } +} diff --git a/src/waltz/mib/fd_netdev_tbl.h b/src/waltz/mib/fd_netdev_tbl.h new file mode 100644 index 0000000000..6b36f4ef42 --- /dev/null +++ b/src/waltz/mib/fd_netdev_tbl.h @@ -0,0 +1,147 @@ +#ifndef 
HEADER_fd_src_waltz_mib_fd_netdev_h +#define HEADER_fd_src_waltz_mib_fd_netdev_h + +/* fd_netdev_tbl.h provides a network interface table. + The entrypoint of this API is fd_netdev_tbl_t. */ + +#include "../../util/fd_util_base.h" + +/* FD_OPER_STATUS_* give the operational state of a network interface. + See RFC 2863 Section 3.1.14: https://datatracker.ietf.org/doc/html/rfc2863#section-3.1.14 */ + +#define FD_OPER_STATUS_INVALID (0) +#define FD_OPER_STATUS_UP (1) /* ready to pass packets */ +#define FD_OPER_STATUS_DOWN (2) +#define FD_OPER_STATUS_TESTING (3) /* in some test mode */ +#define FD_OPER_STATUS_UNKNOWN (4) /* status can not be determined */ +#define FD_OPER_STATUS_DORMANT (5) +#define FD_OPER_STATUS_NOT_PRESENT (6) /* some component is missing */ +#define FD_OPER_STATUS_LOWER_LAYER_DOWN (7) /* down due to state of lower-layer interface(s) */ + +/* fd_netdev_t holds basic configuration of a network device. */ + +struct fd_netdev { + ushort mtu; /* Largest layer-3 payload that fits in a packet */ + uchar mac_addr[6]; /* MAC address */ + ushort if_idx; /* Interface index */ + short slave_tbl_idx; /* index to bond slave table, -1 if not a bond master */ + short master_idx; /* index of bond master, -1 if not a bond slave */ + char name[16]; /* cstr interface name (max 15 length) */ + uchar oper_status; /* one of FD_OPER_STATUS_{...} */ + uchar pad[1]; + /* padded to 32 bytes */ +}; + +typedef struct fd_netdev fd_netdev_t; + +/* FD_NETDEV_BOND_SLAVE_MAX is the max supported number of bond slaves. */ + +#define FD_NETDEV_BOND_SLAVE_MAX (16) + +/* fd_netdev_bond_t lists active slaves of a bond device. */ + +struct fd_netdev_bond { + uchar slave_cnt; + ushort slave_idx[ FD_NETDEV_BOND_SLAVE_MAX ]; +}; + +typedef struct fd_netdev_bond fd_netdev_bond_t; + +/* fd_netdev_tbl_t provides an interface table. + + This table is optimized for frequent reads and rare writes. It is + generally not thread-safe to modify the table in-place. 
The only safe + way to sync modifications to other threads is by copying the table in + its entirety. */ + +struct fd_netdev_tbl_private; +typedef struct fd_netdev_tbl_private fd_netdev_tbl_t; + +struct fd_netdev_tbl_hdr { + ushort dev_max; + ushort bond_max; + ushort dev_cnt; + ushort bond_cnt; +}; +typedef struct fd_netdev_tbl_hdr fd_netdev_tbl_hdr_t; + +struct fd_netdev_tbl_join { + fd_netdev_tbl_hdr_t * hdr; + fd_netdev_t * dev_tbl; + fd_netdev_bond_t * bond_tbl; +}; +typedef struct fd_netdev_tbl_join fd_netdev_tbl_join_t; + +#define FD_NETDEV_TBL_MAGIC (0xd5f9ba2710d6bf0aUL) /* random */ + +/* FD_NETDEV_TBL_ALIGN is the return value of fd_netdev_tbl_align() */ + +#define FD_NETDEV_TBL_ALIGN (16UL) + +FD_PROTOTYPES_BEGIN + +/* fd_netdev_tbl_{align,footprint} describe a memory region suitable to + back a netdev_tbl with dev_max interfaces and bond_max bond masters. */ + +FD_FN_CONST ulong +fd_netdev_tbl_align( void ); + +ulong +fd_netdev_tbl_footprint( ulong dev_max, + ulong bond_max ); + +/* fd_netdev_tbl_new formats a memory region as an empty netdev_tbl. + Returns shmem on success. On failure returns NULL and logs reason for + failure. */ + +void * +fd_netdev_tbl_new( void * shmem, + ulong dev_max, + ulong bond_max ); + +/* fd_netdev_tbl_join joins a netdev_tbl at shtbl. ljoin points to a + fd_netdev_tbl_join_t[1] to which object information is written to. + Returns ljoin on success. On failure, returns NULL and logs reason for + failure. */ + +fd_netdev_tbl_join_t * +fd_netdev_tbl_join( void * ljoin, + void * shtbl ); + +/* fd_netdev_tbl_leave undoes a fd_netdev_tbl_join. Returns ownership + of the region backing join to the caller. (Warning: This returns ljoin, + not shtbl) */ + +void * +fd_netdev_tbl_leave( fd_netdev_tbl_join_t * join ); + +/* fd_netdev_tbl_delete unformats the memory region backing a netdev_tbl + and returns ownership of the region back to the caller. 
*/ + +void * +fd_netdev_tbl_delete( void * shtbl ); + +/* fd_netdev_tbl_reset resets the table to the state of a newly constructed + empty object (clears all devices and bonds). */ + +void +fd_netdev_tbl_reset( fd_netdev_tbl_join_t * tbl ); + +#if FD_HAS_HOSTED + +/* fd_netdev_tbl_fprintf prints the interface table to the given FILE * + pointer (or target equivalent). Outputs ASCII encoding with LF + newlines. Returns errno on failure and 0 on success. */ + +int +fd_netdev_tbl_fprintf( fd_netdev_tbl_join_t const * tbl, + void * file ); + +#endif /* FD_HAS_HOSTED */ + +FD_PROTOTYPES_END + +char const * +fd_oper_status_cstr( uint oper_status ); + +#endif /* HEADER_fd_src_waltz_mib_fd_netdev_h */ diff --git a/src/waltz/mib/test_netdev_netlink.c b/src/waltz/mib/test_netdev_netlink.c new file mode 100644 index 0000000000..e25363592b --- /dev/null +++ b/src/waltz/mib/test_netdev_netlink.c @@ -0,0 +1,61 @@ +#include +#include "fd_netdev_netlink.h" +#include "../../util/fd_util.h" + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); + if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL; /* cpu_idx==cpu_cnt is already out of range; fall back to cpu 0 */ + + char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "normal" ); + ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 4096UL ); + ulong numa_idx = fd_env_strip_cmdline_ulong( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx( cpu_idx ) ); + ulong dev_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-cnt", NULL, 256UL ); + ulong bond_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--bond-cnt", NULL, 4UL ); + + ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); + if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" )); + + if( FD_UNLIKELY( !dev_cnt ) ) FD_LOG_ERR(( "unsupported --dev-cnt" )); + if( FD_UNLIKELY( !bond_cnt ) ) FD_LOG_ERR(( "unsupported --bond-cnt" )); + + FD_LOG_NOTICE(( "Creating workspace (--page-cnt %lu, --page-sz %s, 
--numa-idx %lu)", page_cnt, _page_sz, numa_idx )); + fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); + FD_TEST( wksp ); + + ulong tbl_fp = fd_netdev_tbl_footprint( dev_cnt, bond_cnt ); + if( FD_UNLIKELY( !tbl_fp ) ) { + FD_LOG_ERR(( "Invalid --dev-cnt or --page-cnt" )); + } + void * tbl_mem = fd_wksp_alloc_laddr( wksp, fd_netdev_tbl_align(), tbl_fp, 1UL ); + FD_TEST( tbl_mem ); + + FD_TEST( fd_netdev_tbl_new( tbl_mem, dev_cnt, bond_cnt )==tbl_mem ); + fd_netdev_tbl_join_t tbl[1]; + FD_TEST( fd_netdev_tbl_join( tbl, tbl_mem )==tbl ); + + fd_netlink_t _netlink[1]; + fd_netlink_t * netlink = fd_netlink_init( _netlink, 42U ); + FD_TEST( netlink ); + + int ld_err = fd_netdev_netlink_load_table( tbl, netlink ); + if( FD_UNLIKELY( ld_err ) ) { + FD_LOG_WARNING(( "Failed to load interfaces (error code %i)", ld_err )); + } + FD_LOG_NOTICE(( "Dumping interface table" )); + fd_log_flush(); + fd_netdev_tbl_fprintf( tbl, stderr ); + fflush( stderr ); + + fd_netlink_fini( netlink ); + fd_netdev_tbl_leave( tbl ); + fd_wksp_free_laddr( fd_netdev_tbl_delete( tbl_mem ) ); + fd_wksp_delete_anonymous( wksp ); + + FD_LOG_NOTICE(( "pass" )); + fd_halt(); + return 0; +} diff --git a/src/waltz/neigh/Local.mk b/src/waltz/neigh/Local.mk new file mode 100644 index 0000000000..0cba1dca5f --- /dev/null +++ b/src/waltz/neigh/Local.mk @@ -0,0 +1,9 @@ +$(call add-hdrs,fd_neigh4_map.h fd_neigh4_map_defines.h) +$(call add-objs,fd_neigh4_map,fd_waltz) +ifdef FD_HAS_LINUX +ifdef FD_HAS_SSE +$(call add-hdrs,fd_neigh4_netlink.h) +$(call add-objs,fd_neigh4_netlink,fd_waltz) +$(call make-unit-test,test_neigh4_netlink,test_neigh4_netlink,fd_waltz fd_util) +endif +endif diff --git a/src/waltz/neigh/fd_neigh4_map.c b/src/waltz/neigh/fd_neigh4_map.c new file mode 100644 index 0000000000..7631b1dfac --- /dev/null +++ b/src/waltz/neigh/fd_neigh4_map.c @@ -0,0 +1,41 @@ +/* Include fd_neigh4_map prototypes */ +#include 
"fd_neigh4_map.h" + +/* Generate fd_neigh4_map definitions */ +#include "fd_neigh4_map_defines.h" +#define MAP_IMPL_STYLE 2 +#include "../../util/tmpl/fd_map_slot_para.c" + +#if FD_HAS_HOSTED && FD_HAS_SSE + +#include +#include +#include +#include "../../util/net/fd_ip4.h" +#include "../../util/net/fd_eth.h" + +int +fd_neigh4_hmap_fprintf( fd_neigh4_hmap_t const * map, + void * file_ ) { + FILE * file = file_; + + ulong ele_max = fd_neigh4_hmap_ele_max( map ); + fd_neigh4_entry_t const * ele = fd_neigh4_hmap_shele_const( map ); + + for( ulong j=0UL; j +#include /* AF_INET */ +#include /* struct nlmsghdr */ +#include /* RTM_NEWNEIGH */ +#include /* struct ndmsg */ +#include "../ip/fd_netlink1.h" +#include "../../util/fd_util.h" +#include "../../util/net/fd_ip4.h" +#include "fd_neigh4_map.h" + +/* fd_neigh4_netlink_request_dump sends an RTM_GETNEIGH dump request + (NLM_F_REQUEST|NLM_F_DUMP, AF_INET) for interface if_idx on + netlink->fd, consuming one sequence number from netlink->seq. Does + not read the reply. Returns 0 on success, errno if send fails, and + EPIPE on a short write. */ + +int +fd_neigh4_netlink_request_dump( fd_netlink_t * netlink, + uint if_idx ) { + + uint seq = netlink->seq++; + + struct { + struct nlmsghdr nlh; + struct ndmsg ndm; + } request; + request.nlh = (struct nlmsghdr) { + .nlmsg_type = RTM_GETNEIGH, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nlmsg_len = sizeof(request), + .nlmsg_seq = seq + }; + request.ndm = (struct ndmsg) { + .ndm_family = AF_INET, + .ndm_ifindex = (int)if_idx + }; + + long send_res = send( netlink->fd, &request, sizeof(request), 0 ); + if( FD_UNLIKELY( send_res<0 ) ) { + FD_LOG_WARNING(( "netlink send(RTM_GETNEIGH,NLM_F_REQUEST|NLM_F_DUMP) failed (%d-%s)", errno, fd_io_strerror( errno ) )); + return errno; + } + if( FD_UNLIKELY( send_res!=sizeof(request ) ) ) { + FD_LOG_WARNING(( "netlink send(RTM_GETNEIGH,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)" )); + return EPIPE; + } + + return 0; +} + +void +fd_neigh4_netlink_ingest_message( fd_neigh4_hmap_t * map, + struct nlmsghdr const * msg_hdr, + uint if_idx ) { + if( FD_UNLIKELY( msg_hdr->nlmsg_type!=RTM_NEWNEIGH && msg_hdr->nlmsg_type!=RTM_DELNEIGH ) ) { + FD_LOG_WARNING(( "unexpected nlmsg_type %u", msg_hdr->nlmsg_type )); + return; /* payload of any other message type is not a struct ndmsg */ + } + + struct ndmsg const * ndm 
= NLMSG_DATA( msg_hdr ); + struct rtattr const * rat = RTM_RTA( ndm ); + long rat_sz = (long)(int)RTM_PAYLOAD( msg_hdr ); + + if( FD_UNLIKELY( ndm->ndm_family!=AF_INET ) ) return; + if( FD_UNLIKELY( (uint)ndm->ndm_ifindex!=if_idx ) ) return; + + uint ip4_dst = 0U; + union { + uchar u6[6]; + ulong ul; + } mac_addr = {0}; + + for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) { + + void * rta = RTA_DATA( rat ); + ulong rta_sz = RTA_PAYLOAD( rat ); + + switch( rat->rta_type ) { + + case NDA_DST: + if( FD_UNLIKELY( rta_sz!=4UL ) ) { + FD_LOG_WARNING(( "unexpected NDA_DST size %lu", rta_sz )); + return; + } + ip4_dst = FD_LOAD( uint, rta ); /* big endian */ + break; + + case NDA_LLADDR: + if( FD_UNLIKELY( rta_sz!=6UL ) ) { + FD_LOG_WARNING(( "unexpected NDA_LLADDR size %lu (is this an Ethernet interface?)", rta_sz )); + return; + } + memcpy( mac_addr.u6, rta, 6 ); + break; + + default: + break; /* ignore */ + } + + } + + if( FD_UNLIKELY( !mac_addr.ul || !ip4_dst ) ) { + FD_LOG_DEBUG(( "Ignoring neighbor table update with missing or invalid L2 or L3 address" )); + return; + } + + /* Determine if we should remove or insert/update entry */ + + int remove = 0; + switch( ndm->ndm_state ) { + case NUD_REACHABLE: + case NUD_STALE: + case NUD_DELAY: + case NUD_PROBE: + case NUD_PERMANENT: + remove = 0; + break; + default: + remove = 1; + break; + } + if( msg_hdr->nlmsg_type==RTM_DELNEIGH ) { + remove = 1; + } + + /* Perform update */ + + if( remove ) { + + fd_neigh4_hmap_remove( map, &ip4_dst, NULL, FD_MAP_FLAG_BLOCKING ); + + } else { + + fd_neigh4_hmap_query_t query[1]; + int prepare_res = fd_neigh4_hmap_prepare( map, &ip4_dst, NULL, query, FD_MAP_FLAG_BLOCKING ); + if( FD_UNLIKELY( prepare_res!=FD_MAP_SUCCESS ) ) { + FD_LOG_WARNING(( "Failed to update neighbor table" )); + return; + } + + fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query ); + + ele->ip4_addr = ip4_dst; + memcpy( ele->mac_addr, mac_addr.u6, 6 ); + + fd_neigh4_hmap_publish( query ); + + } + +} 
+ +int +fd_neigh4_netlink_solicit( fd_netlink_t * netlink, + uint if_idx, + uint ip4_addr ) { + + uint seq = netlink->seq++; + + struct { + struct nlmsghdr nlh; + struct ndmsg ndm; + struct nlattr nla_dst; + uint dst_addr; + } request; + request.nlh = (struct nlmsghdr) { + .nlmsg_type = RTM_NEWNEIGH, + .nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, + .nlmsg_seq = seq + }; + request.ndm = (struct ndmsg) { + .ndm_family = AF_INET, + .ndm_ifindex = (int)if_idx, + .ndm_state = NUD_INCOMPLETE, /* neighbor entry starts out as empty */ + .ndm_flags = NTF_USE /* mark neighbor as used which triggers ARP request */ + }; + request.nla_dst = (struct nlattr) { + .nla_type = NDA_DST, + .nla_len = (ushort)( sizeof(struct nlattr) + fd_uint_align_up( sizeof(uint), NLA_ALIGNTO ) ) + }; + request.dst_addr = ip4_addr; /* big endian */ + + /* Send request */ + + long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 ); + if( FD_UNLIKELY( send_res<0 ) ) { + FD_LOG_WARNING(( "netlink send(RTM_NEWNEIGH,NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE," FD_IP4_ADDR_FMT ") failed (%d-%s)", + FD_IP4_ADDR_FMT_ARGS( ip4_addr ), errno, fd_io_strerror( errno ) )); + return errno; + } + if( FD_UNLIKELY( send_res!=sizeof(request) ) ) { + FD_LOG_WARNING(( "netlink send(RTM_NEWNEIGH,NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE," FD_IP4_ADDR_FMT ") failed (short write)", + FD_IP4_ADDR_FMT_ARGS( ip4_addr ) )); + return EPIPE; + } + + /* Get error code */ + + for( ulong attempt=0UL; attempt<64UL; attempt++ ) { + uchar buf[ 4096 ]; + long recv_res = fd_netlink_read_socket( netlink->fd, buf, sizeof(buf) ); + if( FD_UNLIKELY( recv_res<0 ) ) { + FD_LOG_WARNING(( "netlink recv failed (%d-%s)", errno, fd_io_strerror( errno ) )); + return errno; + } + + struct nlmsghdr const * nlh = fd_type_pun_const( buf ); + if( FD_UNLIKELY( nlh->nlmsg_seq != seq ) ) { + /* Should only happen if caller misbehaves */ + FD_LOG_WARNING(( "Dropping rtnetlink message type=%u seq=%u", 
nlh->nlmsg_type, nlh->nlmsg_seq )); + continue; + } + + if( FD_UNLIKELY( nlh->nlmsg_type!=NLMSG_ERROR ) ) { + /* Should never happen */ + FD_LOG_WARNING(( "unexpected nlmsg_type %u for RTM_NEWNEIGH request", nlh->nlmsg_type )); + continue; + } + + struct nlmsgerr * err = NLMSG_DATA( nlh ); + int nl_err = -err->error; + return nl_err; + } + + FD_LOG_WARNING(( "Giving up on receiving response code for RTM_NEWNEIGH request" )); + return 0; +} diff --git a/src/waltz/neigh/fd_neigh4_netlink.h b/src/waltz/neigh/fd_neigh4_netlink.h new file mode 100644 index 0000000000..f7f136e17e --- /dev/null +++ b/src/waltz/neigh/fd_neigh4_netlink.h @@ -0,0 +1,50 @@ +/* fd_neigh4_netlink.h provides APIs for importing IPv4 neighbors from + Linux netlink. Assumes link-layer addresses are 6 bytes long. */ + +#if defined(__linux__) + +#include "fd_neigh4_map.h" +#include "../ip/fd_netlink1.h" + +struct nlmsghdr; /* forward declaration */ + +/* FD_NEIGH_NETLINK_* gives error codes for netlink import operations. */ + +FD_PROTOTYPES_BEGIN + +/* fd_neigh4_netlink_request_dump requests a dump of the IPv4 neighbor + table for the given interface index. The kernel typically responds with + multi-part messages. */ + +int +fd_neigh4_netlink_request_dump( fd_netlink_t * netlink, + uint if_idx ); + +/* fd_neigh4_netlink_ingest_message imports an RTM_NEWNEIGH or RTM_DELNEIGH + message. Logs warning if a netlink message with a different type is + inserted. Logs warning if link-layer addresses is not 6 bytes long. + (The caller is expected to verify that if_idx is an Ethernet interface.) + Ignores messages with an interface index other than if_idx. Causes + insert, update, or remove of a neighbor table entry. Only respects + IPv4 neighbor entries. Silently ignores IPv6 neighbor entries. 
*/ + +void +fd_neigh4_netlink_ingest_message( fd_neigh4_hmap_t * map, + struct nlmsghdr const * msg, + uint if_idx ); + +/* fd_neigh4_netlink_solicit requests the kernel to create a new neighbor + table entry and start an ARP request for it. Uses sendto(2) syscall. + Immediately tries to recvfrom(2) the error code. Assumes that netlink + socket is not bound and has no buffered messages. Returns 0 on success + and netlink error code on failure. The most common reason for failure + is EEXIST (neighbor entry already exists). */ + +int +fd_neigh4_netlink_solicit( fd_netlink_t * netlink, + uint if_idx, + uint ip4_addr ); + +FD_PROTOTYPES_END + +#endif /* defined(__linux__) */ diff --git a/src/waltz/neigh/test_neigh4_netlink.c b/src/waltz/neigh/test_neigh4_netlink.c new file mode 100644 index 0000000000..fb1a3609dd --- /dev/null +++ b/src/waltz/neigh/test_neigh4_netlink.c @@ -0,0 +1,155 @@ +#include "fd_neigh4_netlink.h" +#include +#include +#include /* AF_PACKET */ +#include +#include /* ARPHRD_ETHER */ +#include +#include +#include "../../util/fd_util.h" + +FD_STATIC_ASSERT( sizeof(fd_neigh4_entry_t)==16, layout ); + +static void +dump_neighbor_table( fd_neigh4_hmap_t * map, + fd_netlink_t * netlink1, + int if_idx ) { + fd_neigh4_netlink_request_dump( netlink1, (uint)if_idx ); + + uchar buf[ 4096 ]; + fd_netlink_iter_t iter[1]; + for( fd_netlink_iter_init( iter, netlink1, buf, sizeof(buf) ); + !fd_netlink_iter_done( iter ); + fd_netlink_iter_next( iter, netlink1 ) ) { + fd_neigh4_netlink_ingest_message( map, fd_netlink_iter_msg( iter ), (uint)if_idx ); + } + + char name[ IF_NAMESIZE ]; + fprintf( stderr, "# ip neigh show dev %s\n", if_indextoname( (uint)if_idx, name ) ); + fd_log_flush(); + fd_neigh4_hmap_fprintf( map, stderr ); + fputs( "\n", stderr ); + + /* Reinitialize table */ + + ulong ele_max = fd_neigh4_hmap_ele_max ( map ); + ulong lock_cnt = fd_neigh4_hmap_lock_cnt ( map ); + ulong probe_max = fd_neigh4_hmap_probe_max( map ); + ulong seed = 
fd_neigh4_hmap_seed ( map ); + void * shmap = fd_neigh4_hmap_shmap ( map ); + void * shele = fd_neigh4_hmap_shele ( map ); + void * ljoin = fd_neigh4_hmap_leave ( map ); + fd_neigh4_hmap_delete( shmap ); + fd_memset( shele, 0, ele_max*sizeof(fd_neigh4_entry_t) ); + FD_TEST( fd_neigh4_hmap_new( shmap, ele_max, lock_cnt, probe_max, seed ) ); + FD_TEST( fd_neigh4_hmap_join( ljoin, shmap, shele ) ); +} + +static void +dump_all_neighbor_tables( fd_neigh4_hmap_t * map, + fd_netlink_t * netlink0, + fd_netlink_t * netlink1 ) { + + /* List all network interfaces */ + + uint seq = netlink0->seq++; + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifi; + } request; + request.nlh = (struct nlmsghdr){ + .nlmsg_len = sizeof(request), + .nlmsg_type = RTM_GETLINK, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nlmsg_seq = seq + }; + request.ifi = (struct ifinfomsg){ + .ifi_family = AF_PACKET, + .ifi_type = ARPHRD_ETHER + }; + + long send_res = send( netlink0->fd, &request, sizeof(request), 0); + if( FD_UNLIKELY( send_res<0 ) ) { + FD_LOG_ERR(( "netlink send(RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER) failed (%d-%s)", errno, fd_io_strerror( errno ) )); + } + if( FD_UNLIKELY( send_res!=sizeof(request) ) ) { + FD_LOG_ERR(( "netlink send(RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER) failed (short write)" )); + } + + FD_LOG_NOTICE(( "Dumping neighbor tables for all Ethernet interfaces\n" )); + fd_log_flush(); + + uchar buf[ 4096 ]; + fd_netlink_iter_t iter[1]; + for( fd_netlink_iter_init( iter, netlink0, buf, sizeof(buf) ); + !fd_netlink_iter_done( iter ); + fd_netlink_iter_next( iter, netlink0 ) ) { + struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter ); + if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) { + struct nlmsgerr * err = NLMSG_DATA( nlh ); + int nl_err = -err->error; + FD_LOG_ERR(( "netlink RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) )); + } + if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWLINK ) ) { + 
FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type )); + continue; + } + struct ifinfomsg const * ifi = NLMSG_DATA( nlh ); + + dump_neighbor_table( map, netlink1, ifi->ifi_index ); + } + +} + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); + if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL; + + char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); + ulong page_cnt = fd_env_strip_cmdline_ulong ( &argc, &argv, "--page-cnt", NULL, 1UL ); + ulong numa_idx = fd_env_strip_cmdline_ulong ( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx(cpu_idx) ); + + ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); + if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" )); + + FD_LOG_NOTICE(( "Creating anonymous workspace with --page-cnt %lu --page-sz %s pages on --numa-idx %lu", page_cnt, _page_sz, numa_idx )); + fd_wksp_t * wksp = fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); + FD_TEST( wksp ); + + fd_netlink_t _netlink[2]; + fd_netlink_t * netlink0 = fd_netlink_init( _netlink+0, 42U ); + fd_netlink_t * netlink1 = fd_netlink_init( _netlink+1, 999U ); + FD_TEST( netlink0 ); + FD_TEST( netlink1 ); + + ulong ele_max = 16384UL; + ulong lock_cnt = 4UL; + ulong probe_max = 16UL; + ulong seed = 42UL; + void * hmap_mem = fd_wksp_alloc_laddr( wksp, fd_neigh4_hmap_align(), fd_neigh4_hmap_footprint( ele_max, lock_cnt, probe_max ), 1UL ); + void * ele_mem = fd_wksp_alloc_laddr( wksp, alignof(fd_neigh4_entry_t), ele_max*sizeof(fd_neigh4_entry_t), 1UL ); + FD_TEST( hmap_mem ); FD_TEST( ele_mem ); + FD_TEST( fd_neigh4_hmap_new( hmap_mem, ele_max, lock_cnt, probe_max, seed ) ); + + fd_neigh4_hmap_t _map[1]; + fd_neigh4_hmap_t * map = fd_neigh4_hmap_join( _map, hmap_mem, ele_mem ); + FD_TEST( map ); + + dump_all_neighbor_tables( map, netlink0, netlink1 ); + + fd_netlink_fini( netlink0 ); + fd_netlink_fini( netlink1 
); + + fd_neigh4_hmap_leave( map ); + fd_wksp_free_laddr( fd_neigh4_hmap_delete( hmap_mem ) ); + fd_wksp_free_laddr( ele_mem ); + fd_wksp_delete_anonymous( wksp ); + + FD_LOG_NOTICE(( "pass" )); + fd_halt(); + return 0; +}