diff --git a/book/.vitepress/config.mts b/book/.vitepress/config.mts
index cd2f03d3dd..3df03d81ad 100644
--- a/book/.vitepress/config.mts
+++ b/book/.vitepress/config.mts
@@ -52,6 +52,13 @@ export default defineConfig({
{ text: 'Troubleshooting', link: 'troubleshooting' },
{ text: 'Frequently Asked Questions', link: 'faq' },
]
+ },
+ {
+ text: 'Internals',
+ collapsed: false,
+ items: [
+ { text: 'Netlink', link: 'netlink' },
+ ]
}
] },
diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md
index 5e922b7bae..effa9677e9 100644
--- a/book/api/metrics-generated.md
+++ b/book/api/metrics-generated.md
@@ -462,3 +462,18 @@
| gossip_gossip_peer_counts_total | `gauge` | Number of gossip peers tracked (Total Peers Detected) |
| gossip_gossip_peer_counts_active | `gauge` | Number of gossip peers tracked (Active) |
| gossip_gossip_peer_counts_inactive | `gauge` | Number of gossip peers tracked (Inactive) |
+
+## Netlnk Tile
+| Metric | Type | Description |
+|--------|------|-------------|
+| netlnk_drop_events | `counter` | Number of netlink drop events caught |
+| netlnk_link_full_syncs | `counter` | Number of full link table syncs done |
+| netlnk_route_full_syncs | `counter` | Number of full route table syncs done |
+| netlnk_updates_link | `counter` | Number of netlink live updates processed (Link) |
+| netlnk_updates_neigh | `counter` | Number of netlink live updates processed (Neighbor Table Entry) |
+| netlnk_updates_ipv4_route | `counter` | Number of netlink live updates processed (IPv4 Route Table Entry) |
+| netlnk_interface_count | `gauge` | Number of network interfaces |
+| netlnk_route_count_local | `gauge` | Number of IPv4 routes (Local) |
+| netlnk_route_count_main | `gauge` | Number of IPv4 routes (Main) |
+| netlnk_neighbor_solicits_sent | `counter` | Number of neighbor solicit requests sent to kernel |
+| netlnk_neighbor_solicits_fails | `counter` | Number of neighbor solicit requests that failed to send |
diff --git a/book/guide/netlink.md b/book/guide/netlink.md
new file mode 100644
index 0000000000..7fd6046850
--- /dev/null
+++ b/book/guide/netlink.md
@@ -0,0 +1,111 @@
+# Netlink Integration
+
+## Summary
+
+Firedancer's userland networking stack sources configuration from netlink
+to allow mostly zero config interoperability with Linux.
+
+This contrasts with other fast networking stacks which typically require
+complex network configuration or a dedicated IP address.
+
+The following describes the netlink integration in detail.
+
+## Tile Overview
+
+Firedancer uses XDP for fast networking. This means that some packet
+processing steps traditionally done in the kernel (with UDP sockets) now
+have to be done in the Firedancer software, specifically routing and
+resolving link-level neighbors.
+
+The required information in these steps is requested from the kernel via
+the [rtnetlink API](https://man7.org/linux/man-pages/man7/rtnetlink.7.html).
+Doing all netlink requests in the data path (i.e. in the net tile) would
+pose a security risk and would be slow.
+
+The reasons netlink requests are done in a separate tile are:
+- **Improved security architecture.** Firedancer's sandbox isolates the
+ netlink interface from untrusted user traffic.
+- **Better performance.** The netlink tile provides shared memory caches
+ that greatly reduce the number of netlink requests.
+
+### "Netbase" shared memory region
+
+The netlink tile keeps a read-only cache of the following information:
+
+- Interface table
+- IPv4 route tables `local` and `main`
+- Neighbor tables (only for XDP-enabled Ethernet interfaces)
+
+The objects containing the above information are stored in the "netbase"
+workspace. (A workspace is a shared memory region)
+
+### Security
+
+A netlink tile requires an rtnetlink socket. On startup, it subscribes
+to route and neighbor table changes. It will also issue RTM_GETROUTE
+and RTM_GETNEIGH requests. On RHEL 8 with a Linux 4.18 kernel, all
+netlink interactions (including creation of the socket) can be done from
+a regular unprivileged user without capabilities.
+
+The kernel's netlink interface exposes a large attack surface.
+Therefore, this tile attempts to isolate itself from direct untrusted
+inputs.
+
+### Data flows
+
+- `[net tiles] <-- [netbase]`
+ Net tiles have read-only access to the shared memory region backing
+ the netbase object. A malicious netlink tile can compromise net tiles
+ by corrupting the netbase object, but not vice versa.
+
+- `[changes by sysadmin] --> [netlink] --> [netlink tile]`
+ Route table updates are forwarded to the netlink tile. This occurs
+ rarely (typically if the sysadmin performs manual changes, or due to
+ a system daemon).
+
+- `[netlink tile] --> [netbase]`
+ The netlink tile writes neighbor and route table updates to a shared
+ memory region.
+
+- `[neighbor discovery] --> [netlink] --> [netlink tile]`
+ Neighbor table updates are forwarded to the netlink tile. This path
+ has limited throughput (a few ~100K updates per second).
+
+- `[untrusted traffic] --> [net tile] --> [app tile]`
+ `--> [net tile] --> [netlink tile] --> [neighbor discovery]`
+ App tiles will blindly respond to the source IP found in untrusted
+ packets. This source IP can be spoofed. Neighbor solicitation might
+ be required in order to find out the MAC address of that IP. On IPv4,
+ these are ARP requests broadcast to the local network.
+
+ Net tiles cannot solicit neighbors directly, so they notify the
+ netlink tile that neighbor solicitation is needed. (Potentially at
+ line rate if network configuration is part of a huge subnet)
+
+ The netlink tile will deduplicate these requests and forward them to
+ the kernel.
+
+ This path is the only direct 'untrusted traffic' -> 'netlink tile'
+ data flow, so the internal neighbor solicit message format is kept
+ as simple as possible for security.
+
+### Neighbor discovery (ARP)
+
+A concurrent open addressed hash table is used to store ARP entries
+(henceforth called "neighbor table"). This table attempts to
+continuously stay in sync with the kernel.
+
+The netlink tile requests neighbor solicitations via the netlink
+equivalent of `ip neigh add dev DEVICE IP use`.
+
+### Routing
+
+The Firedancer network stack supports very simple routing tables as
+typically seen on cloud instances, servers directly connected to an
+Ethernet switch, or a router.
+
+Only the "local" and "main" routing tables are synchronized. Policy
+based routing and additional routing tables are NOT supported.
+
+Outgoing traffic matching the "local" table is sent to the loopback
+device.
diff --git a/src/app/fdctl/Local.mk b/src/app/fdctl/Local.mk
index 8cef9b5c0b..a47da0fe2b 100644
--- a/src/app/fdctl/Local.mk
+++ b/src/app/fdctl/Local.mk
@@ -18,6 +18,7 @@ $(OBJDIR)/obj/app/fdctl/version.d: src/app/fdctl/version.h
# fdctl core
$(call add-objs,main1 config config_parse caps utility keys ready mem spy help version,fd_fdctl)
+$(call add-objs,netconf,fd_fdctl)
$(call add-objs,run/run run/run1 run/run_agave,fd_fdctl)
$(call add-objs,monitor/monitor monitor/helper,fd_fdctl)
$(call make-fuzz-test,fuzz_fdctl_config,fuzz_fdctl_config,fd_fdctl fd_ballet fd_util)
diff --git a/src/app/fdctl/config.c b/src/app/fdctl/config.c
index 083fdbb36c..945c7036ba 100644
--- a/src/app/fdctl/config.c
+++ b/src/app/fdctl/config.c
@@ -11,6 +11,9 @@
#include "../../flamenco/runtime/fd_blockstore.h"
#include "../../flamenco/runtime/fd_txncache.h"
#include "../../funk/fd_funk.h"
+#include "../../waltz/ip/fd_fib4.h"
+#include "../../waltz/mib/fd_dbl_buf.h"
+#include "../../waltz/neigh/fd_neigh4_map.h"
#include "../../util/net/fd_eth.h"
#include "../../util/net/fd_ip4.h"
@@ -218,12 +221,22 @@ fdctl_obj_align( fd_topo_t const * topo,
return fd_fseq_align();
} else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) {
return FD_METRICS_ALIGN;
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) {
+ ulong align = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.align", obj->id );
+ if( FD_UNLIKELY( align==ULONG_MAX ) ) FD_LOG_ERR(( "obj.%lu.align was not set", obj->id ));
+ return align;
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) {
+ return fd_dbl_buf_align();
} else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) {
return fd_blockstore_align();
} else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) {
return fd_funk_align();
} else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) {
return fd_txncache_align();
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) {
+ return fd_neigh4_hmap_align();
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "fib4" ) ) ) {
+ return fd_fib4_align();
} else {
FD_LOG_ERR(( "unknown object `%s`", obj->name ));
return 0UL;
@@ -259,12 +272,20 @@ fdctl_obj_footprint( fd_topo_t const * topo,
return fd_fseq_footprint();
} else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) {
return FD_METRICS_FOOTPRINT( VAL("in_cnt"), VAL("cons_cnt") );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) {
+ return VAL("footprint");
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) {
+ return fd_dbl_buf_footprint( VAL("mtu") );
} else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) {
return fd_blockstore_footprint( VAL("shred_max"), VAL("block_max"), VAL("idx_max"), VAL("txn_max") ) + VAL("alloc_max");
} else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) {
return fd_funk_footprint();
} else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) {
return fd_txncache_footprint( VAL("max_rooted_slots"), VAL("max_live_slots"), VAL("max_txn_per_slot"), FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) {
+ return fd_neigh4_hmap_footprint( VAL("ele_max"), VAL("lock_cnt"), VAL("probe_max") );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "fib4" ) ) ) {
+ return fd_fib4_footprint( VAL("route_max") );
} else {
FD_LOG_ERR(( "unknown object `%s`", obj->name ));
return 0UL;
@@ -504,34 +525,6 @@ fdctl_cfg_from_env( int * pargc,
config->tiles.net.ip_addr = iface_ip;
mac_address( config->tiles.net.interface, config->tiles.net.mac_addr );
- /* support for multihomed hosts */
- ulong multi_cnt = config->tiles.net.multihome_ip_addrs_cnt;
- for( ulong j = 0; j < multi_cnt; ++j ) {
- int success = fd_cstr_to_ip4_addr( config->tiles.net.multihome_ip_addrs[j],
- &config->tiles.net.multihome_ip4_addrs[j] );
- if( !success ) {
- FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] "
- "specifies malformed IP address `%s`",
- config->tiles.net.multihome_ip_addrs[j] ));
- }
- }
-
- /* look for duplicate addresses */
- /* there's only a few, so do the O(n^2) comparison */
- for( ulong j = 0; j < multi_cnt; ++j ) {
- if( config->tiles.net.ip_addr == config->tiles.net.multihome_ip4_addrs[j] ) {
- FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] "
- "specifies an address that matches [tiles.net.src_ip_addr]" ));
- }
- for( ulong k = j+1; k < multi_cnt; ++k ) {
- if( config->tiles.net.multihome_ip4_addrs[j] == config->tiles.net.multihome_ip4_addrs[k] ) {
- FD_LOG_ERR(( "configuration option [tiles.net.multihome_ip_addrs] "
- "specifies duplicate ip addresses `%s`",
- config->tiles.net.multihome_ip_addrs[j] ));
- }
- }
- }
-
}
username_to_id( config );
diff --git a/src/app/fdctl/config.h b/src/app/fdctl/config.h
index f8718ac960..ae22828a9b 100644
--- a/src/app/fdctl/config.h
+++ b/src/app/fdctl/config.h
@@ -17,7 +17,7 @@
/* config_t represents all available configuration options that could be
set in a user defined configuration toml file. For information about
the options, see the `default.toml` file provided. */
-typedef struct {
+struct fdctl_config {
char name[ NAME_SZ ];
char user[ 256 ];
char hostname[ FD_LOG_NAME_MAX ];
@@ -216,12 +216,13 @@ typedef struct {
uint xdp_aio_depth;
uint send_buffer_size;
-
- ulong multihome_ip_addrs_cnt; /* number of home ip addresses */
- char multihome_ip_addrs[FD_NET_MAX_SRC_ADDR][32];
- uint multihome_ip4_addrs[FD_NET_MAX_SRC_ADDR];
} net;
+ struct {
+ ulong max_routes;
+ ulong max_neighbors;
+ } netlink;
+
struct {
ushort regular_transaction_listen_port;
ushort quic_transaction_listen_port;
@@ -319,7 +320,9 @@ typedef struct {
} batch;
} tiles;
-} config_t;
+};
+
+typedef struct fdctl_config config_t;
FD_PROTOTYPES_BEGIN
diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml
index 9a738e24fd..53f2b15b8d 100644
--- a/src/app/fdctl/config/default.toml
+++ b/src/app/fdctl/config/default.toml
@@ -882,12 +882,29 @@ dynamic_port_range = "8900-9000"
# this really be configurable?
send_buffer_size = 16384
- # The XDP program will filter packets that aren't destined for
- # the IPv4 address of the interface bound above, but sometimes a
- # validator may advertise multiple IP addresses. In this case
- # the additional addresses can be specified here, and packets
- # addressed to them will be accepted.
- multihome_ip_addrs = []
+ # The netlink tile forwards Linux network configuration to net tiles.
+ # This config section contains advanced options that typically do not
+ # need to be changed.
+ # For further info, see https://docs.firedancer.io/guide/netlink.html
+ [tiles.netlink]
+ # The maximum number of routes per route table.
+ #
+ # The netlink tile imports two route tables from Linux, namely
+ # `local` and `main`. You can view them by running
+ # `ip route show table main`. Decreasing this option can result
+ # in connectivity issues. Increasing this option can drastically
+ # decrease performance.
+ #
+ # For virtually all cloud and bare-metal server providers, the
+ # number of routes per table does not exceed 16.
+ max_routes = 128
+
+ # The maximum number of Ethernet neighbors.
+ #
+ # This should be roughly as large as the size of your Ethernet subnet.
+ # E.g. if your IP address is 198.51.100.3/24, then your subnet has
+ # up to 256 neighbors (2^(32-24)).
+ max_neighbors = 4096
# QUIC tiles are responsible for serving network traffic, including
# parsing and responding to packets and managing connection timeouts
diff --git a/src/app/fdctl/config_parse.c b/src/app/fdctl/config_parse.c
index f6eb07f4c8..9a7639f85a 100644
--- a/src/app/fdctl/config_parse.c
+++ b/src/app/fdctl/config_parse.c
@@ -290,7 +290,9 @@ fdctl_pod_to_cfg( config_t * config,
CFG_POP ( uint, tiles.net.xdp_tx_queue_size );
CFG_POP ( uint, tiles.net.xdp_aio_depth );
CFG_POP ( uint, tiles.net.send_buffer_size );
- CFG_POP_ARRAY( cstr, tiles.net.multihome_ip_addrs );
+
+ CFG_POP ( ulong, tiles.netlink.max_routes );
+ CFG_POP ( ulong, tiles.netlink.max_neighbors );
CFG_POP ( ushort, tiles.quic.regular_transaction_listen_port );
CFG_POP ( ushort, tiles.quic.quic_transaction_listen_port );
@@ -461,6 +463,9 @@ fdctl_cfg_validate( config_t * cfg ) {
CFG_HAS_NON_ZERO ( tiles.net.xdp_aio_depth );
CFG_HAS_NON_ZERO ( tiles.net.send_buffer_size );
+ CFG_HAS_NON_ZERO( tiles.netlink.max_routes );
+ CFG_HAS_NON_ZERO( tiles.netlink.max_neighbors );
+
CFG_HAS_NON_ZERO( tiles.quic.regular_transaction_listen_port );
CFG_HAS_NON_ZERO( tiles.quic.quic_transaction_listen_port );
CFG_HAS_NON_ZERO( tiles.quic.max_concurrent_connections );
diff --git a/src/app/fdctl/fdctl.h b/src/app/fdctl/fdctl.h
index 56ef0724b5..2079cb5b1a 100644
--- a/src/app/fdctl/fdctl.h
+++ b/src/app/fdctl/fdctl.h
@@ -123,8 +123,7 @@ fdctl_obj_loose( fd_topo_t const * topo,
fd_topo_run_tile_t
fdctl_tile_run( fd_topo_tile_t * tile );
-#define ACTIONS_CNT (11UL)
-extern action_t ACTIONS[ ACTIONS_CNT ];
+extern action_t ACTIONS[];
void fdctl_boot( int * pargc,
char *** pargv,
@@ -207,6 +206,10 @@ void
spy_cmd_fn( args_t * args,
config_t * const config );
+void
+netconf_cmd_fn( args_t * args,
+ config_t * config );
+
void
help_cmd_fn( args_t * args,
config_t * const config );
diff --git a/src/app/fdctl/help.c b/src/app/fdctl/help.c
index f795b1da83..7c1735e874 100644
--- a/src/app/fdctl/help.c
+++ b/src/app/fdctl/help.c
@@ -13,7 +13,7 @@ help_cmd_fn( args_t * args,
--config parameter. */
FD_LOG_STDOUT(( " --config Path to config TOML file\n\n" ));
FD_LOG_STDOUT(( "SUBCOMMANDS:\n" ));
- for( ulong i=0; i
#include
-action_t ACTIONS[ ACTIONS_CNT ] = {
+action_t ACTIONS[] = {
{ .name = "run", .args = NULL, .fn = run_cmd_fn, .perm = run_cmd_perm, .description = "Start up a Firedancer validator" },
{ .name = "run1", .args = run1_cmd_args, .fn = run1_cmd_fn, .perm = NULL, .description = "Start up a single Firedancer tile" },
{ .name = "run-agave", .args = NULL, .fn = run_agave_cmd_fn, .perm = NULL, .description = "Start up the Agave side of a Firedancer validator" },
@@ -16,8 +16,10 @@ action_t ACTIONS[ ACTIONS_CNT ] = {
{ .name = "ready", .args = NULL, .fn = ready_cmd_fn, .perm = NULL, .description = "Wait for all tiles to be running" },
{ .name = "mem", .args = NULL, .fn = mem_cmd_fn, .perm = NULL, .description = "Print workspace memory and tile topology information" },
{ .name = "spy", .args = NULL, .fn = spy_cmd_fn, .perm = NULL, .description = "Spy on and print out gossip traffic" },
+ { .name = "netconf", .args = NULL, .fn = netconf_cmd_fn, .perm = NULL, .description = "Print network configuration" },
{ .name = "help", .args = NULL, .fn = help_cmd_fn, .perm = NULL, .description = "Print this help message" },
{ .name = "version", .args = NULL, .fn = version_cmd_fn, .perm = NULL, .description = "Show the current software version" },
+ {0}
};
struct action_alias {
@@ -278,7 +280,7 @@ main1( int argc,
}
action_t * action = NULL;
- for( ulong i=0; i
+#include
+
+/* netconf_cmd_fn implements the `netconf` action.  It looks up the
+   "netbase" workspace and the "netlnk" tile in config->topo, joins the
+   workspace in read-only mode, and prints to stdout, in order: the
+   interface table, the IPv4 "main" route table, the IPv4 "local" route
+   table, and the IPv4 neighbor table.  args is unused.  Failures to
+   find or join any of these objects are reported through FD_LOG_ERR /
+   FD_TEST. */
+
+void
+netconf_cmd_fn( args_t *   args,
+                config_t * config ) {
+  (void)args;
+
+  fd_topo_t * topo = &config->topo;
+
+  ulong wksp_id = fd_topo_find_wksp( topo, "netbase" );
+  if( FD_UNLIKELY( wksp_id==ULONG_MAX ) ) {
+    FD_LOG_ERR(( "netbase workspace not found" ));
+  }
+  fd_topo_wksp_t * netbase = &topo->workspaces[ wksp_id ];
+
+  ulong tile_id = fd_topo_find_tile( topo, "netlnk", 0UL );
+  if( FD_UNLIKELY( tile_id==ULONG_MAX ) ) {
+    FD_LOG_ERR(( "netlnk tile not found" ));
+  }
+  fd_topo_tile_t * tile = &topo->tiles[ tile_id ];
+
+  fd_topo_join_workspace( topo, netbase, FD_SHMEM_JOIN_MODE_READ_ONLY );
+
+  /* Interface table: copy the current contents of the double buffer
+     into a private allocation and print from that copy. */
+
+  puts( "\nINTERFACES\n" );
+  fd_dbl_buf_t * netdev_buf = fd_dbl_buf_join( fd_topo_obj_laddr( topo, tile->netlink.netdev_dbl_buf_obj_id ) );
+  FD_TEST( netdev_buf );
+  void * netdev_copy = aligned_alloc( fd_netdev_tbl_align(), fd_dbl_buf_obj_mtu( netdev_buf ) );
+  FD_TEST( netdev_copy ); /* aligned_alloc returns NULL on failure; do not read into NULL */
+  fd_dbl_buf_read( netdev_buf, netdev_copy, NULL );
+  fd_netdev_tbl_join_t netdev[1];
+  FD_TEST( fd_netdev_tbl_join( netdev, netdev_copy ) );
+  fd_netdev_tbl_fprintf( netdev, stdout );
+  fd_netdev_tbl_leave( netdev );
+  free( netdev_copy );
+  fd_dbl_buf_leave( netdev_buf );
+
+  /* IPv4 route tables are printed directly from the joined objects */
+
+  puts( "\nIPv4 ROUTES (main)\n" );
+  fd_fib4_t * fib4_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_main_obj_id ) );
+  FD_TEST( fib4_main );
+  fd_fib4_fprintf( fib4_main, stdout );
+  fd_fib4_leave( fib4_main );
+
+  puts( "\nIPv4 ROUTES (local)\n" );
+  fd_fib4_t * fib4_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_local_obj_id ) );
+  FD_TEST( fib4_local );
+  fd_fib4_fprintf( fib4_local, stdout );
+  fd_fib4_leave( fib4_local );
+
+  /* Neighbor table: resolve the interface index to a name for the
+     heading.  If the lookup fails, fall back to "???" (rewritten
+     explicitly in case the failed call touched the buffer). */
+
+  char if_name[ IF_NAMESIZE ] = "???";
+  if( FD_UNLIKELY( !if_indextoname( tile->netlink.neigh_if_idx, if_name ) ) ) {
+    memcpy( if_name, "???", 4 );
+  }
+  printf( "\nNEIGHBOR TABLE (%u-%s)\n\n", tile->netlink.neigh_if_idx, if_name );
+  fd_neigh4_hmap_t neigh4[1];
+  FD_TEST( fd_neigh4_hmap_join(
+      neigh4,
+      fd_topo_obj_laddr( topo, tile->netlink.neigh4_obj_id ),
+      fd_topo_obj_laddr( topo, tile->netlink.neigh4_ele_obj_id ) ) );
+  fd_neigh4_hmap_fprintf( neigh4, stdout );
+  fd_neigh4_hmap_leave( neigh4 );
+
+  puts( "" );
+}
diff --git a/src/app/fdctl/run/run.c b/src/app/fdctl/run/run.c
index 98f4831de7..f00522aff7 100644
--- a/src/app/fdctl/run/run.c
+++ b/src/app/fdctl/run/run.c
@@ -15,6 +15,9 @@
#include "../../../flamenco/runtime/fd_txncache.h"
#include "../../../funk/fd_funk_filemap.h"
#include "../../../funk/fd_funk.h"
+#include "../../../waltz/ip/fd_fib4.h"
+#include "../../../waltz/mib/fd_dbl_buf.h"
+#include "../../../waltz/neigh/fd_neigh4_map.h"
#include "../configure/configure.h"
#include
@@ -538,14 +541,22 @@ fdctl_obj_new( fd_topo_t const * topo,
FD_TEST( fd_fseq_new( laddr, ULONG_MAX ) );
} else if( FD_UNLIKELY( !strcmp( obj->name, "metrics" ) ) ) {
FD_TEST( fd_metrics_new( laddr, VAL("in_cnt"), VAL("cons_cnt") ) );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "opaque" ) ) ) {
+ fd_memset( laddr, 0, VAL("footprint") );
} else if( FD_UNLIKELY( !strcmp( obj->name, "ulong" ) ) ) {
*(ulong*)laddr = 0;
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "dbl_buf" ) ) ) {
+ FD_TEST( fd_dbl_buf_new( laddr, VAL("mtu"), 1UL ) );
} else if( FD_UNLIKELY( !strcmp( obj->name, "blockstore" ) ) ) {
FD_TEST( fd_blockstore_new( laddr, VAL("wksp_tag"), VAL("seed"), VAL("shred_max"), VAL("block_max"), VAL("idx_max"), VAL("txn_max") ) );
} else if( FD_UNLIKELY( !strcmp( obj->name, "funk" ) ) ) {
FD_TEST( fd_funk_new( laddr, VAL("wksp_tag"), VAL("seed"), VAL("txn_max"), VAL("rec_max") ) );
} else if( FD_UNLIKELY( !strcmp( obj->name, "txncache" ) ) ) {
FD_TEST( fd_txncache_new( laddr, VAL("max_rooted_slots"), VAL("max_live_slots"), VAL("max_txn_per_slot"), FD_TXNCACHE_DEFAULT_MAX_CONSTIPATED_SLOTS ) );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "neigh4_hmap" ) ) ) {
+ FD_TEST( fd_neigh4_hmap_new( laddr, VAL("ele_max"), VAL("lock_cnt"), VAL("probe_max"), VAL("seed") ) );
+ } else if( FD_UNLIKELY( !strcmp( obj->name, "fib4" ) ) ) {
+ FD_TEST( fd_fib4_new( laddr, VAL("route_max") ) );
} else {
FD_LOG_ERR(( "unknown object `%s`", obj->name ));
}
diff --git a/src/app/fdctl/run/topos/fd_firedancer.c b/src/app/fdctl/run/topos/fd_firedancer.c
index b01b0a3e6e..185a79fedb 100644
--- a/src/app/fdctl/run/topos/fd_firedancer.c
+++ b/src/app/fdctl/run/topos/fd_firedancer.c
@@ -8,6 +8,7 @@
#include "../../../../disco/tiles.h"
#include "../../../../disco/topo/fd_topob.h"
#include "../../../../disco/topo/fd_pod_format.h"
+#include "../../../../disco/netlink/fd_netlink_tile.h" /* fd_netlink_topo_create */
#include "../../../../flamenco/runtime/fd_blockstore.h"
#include "../../../../flamenco/runtime/fd_runtime.h"
#include "../../../../flamenco/runtime/fd_txncache.h"
@@ -78,6 +79,7 @@ fd_topo_initialize( config_t * config ) {
fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
/* topo, name */
+ fd_topob_wksp( topo, "netbase" );
fd_topob_wksp( topo, "net_shred" );
fd_topob_wksp( topo, "net_gossip" );
fd_topob_wksp( topo, "net_repair" );
@@ -131,6 +133,7 @@ fd_topo_initialize( config_t * config ) {
fd_topob_wksp( topo, "batch_replay" );
fd_topob_wksp( topo, "net" );
+ fd_topob_wksp( topo, "netlink" );
fd_topob_wksp( topo, "quic" );
fd_topob_wksp( topo, "verify" );
fd_topob_wksp( topo, "dedup" );
@@ -239,6 +242,7 @@ fd_topo_initialize( config_t * config ) {
/* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave */
FOR(net_tile_cnt) fd_topob_tile( topo, "net", "net", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
+ fd_topo_tile_t * netlink_tile = fd_topob_tile( topo, "netlnk" , "netlink", "metric_in", ULONG_MAX, 0 );
FOR(quic_tile_cnt) fd_topob_tile( topo, "quic", "quic", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
FOR(verify_tile_cnt) fd_topob_tile( topo, "verify", "verify", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
/**/ fd_topob_tile( topo, "dedup", "dedup", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
@@ -346,6 +350,13 @@ fd_topo_initialize( config_t * config ) {
topo->tile_cnt, affinity_tile_cnt ));
}
+ /* The netlink tile shares various objects with net tiles */
+ fd_netlink_topo_create( netlink_tile, topo, config );
+ for( ulong i=0UL; itiles[ net_tile_id ] );
+ }
+
/* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */
FOR(net_tile_cnt) for( ulong j=0UL; jnet.repair_intake_listen_port = config->tiles.repair.repair_intake_listen_port;
tile->net.repair_serve_listen_port = config->tiles.repair.repair_serve_listen_port;
- /* multihome support */
- ulong multi_cnt = tile->net.multihome_ip_addrs_cnt = config->tiles.net.multihome_ip_addrs_cnt;
- for( ulong j = 0; j < multi_cnt; ++j ) {
- tile->net.multihome_ip_addrs[j] = config->tiles.net.multihome_ip4_addrs[j];
- }
+ } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) {
+
+ /* already configured */
} else if( FD_UNLIKELY( !strcmp( tile->name, "quic" ) ) ) {
fd_memcpy( tile->quic.src_mac_addr, config->tiles.net.mac_addr, 6 );
diff --git a/src/app/fdctl/run/topos/fd_frankendancer.c b/src/app/fdctl/run/topos/fd_frankendancer.c
index c6cce58f2e..081592f9d3 100644
--- a/src/app/fdctl/run/topos/fd_frankendancer.c
+++ b/src/app/fdctl/run/topos/fd_frankendancer.c
@@ -4,6 +4,7 @@
#include "../../../../disco/tiles.h"
#include "../../../../disco/topo/fd_topob.h"
#include "../../../../disco/topo/fd_pod_format.h"
+#include "../../../../disco/netlink/fd_netlink_tile.h" /* fd_netlink_topo_create */
#include "../../../../util/tile/fd_tile_private.h"
#include "../../../../util/shmem/fd_shmem_private.h"
@@ -19,6 +20,7 @@ fd_topo_initialize( config_t * config ) {
fd_topo_t * topo = { fd_topob_new( &config->topo, config->name ) };
/* topo, name */
+ fd_topob_wksp( topo, "netbase" );
fd_topob_wksp( topo, "net_quic" );
fd_topob_wksp( topo, "net_shred" );
fd_topob_wksp( topo, "quic_verify" );
@@ -38,6 +40,7 @@ fd_topo_initialize( config_t * config ) {
fd_topob_wksp( topo, "sign_shred" );
fd_topob_wksp( topo, "net" );
+ fd_topob_wksp( topo, "netlink" );
fd_topob_wksp( topo, "quic" );
fd_topob_wksp( topo, "verify" );
fd_topob_wksp( topo, "dedup" );
@@ -106,6 +109,8 @@ fd_topo_initialize( config_t * config ) {
/* topo, tile_name, tile_wksp, metrics_wksp, cpu_idx, is_agave */
FOR(net_tile_cnt) fd_topob_tile( topo, "net", "net", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
+ fd_topo_tile_t * netlink_tile =
+ /**/ fd_topob_tile( topo, "netlnk" , "netlink", "metric_in", ULONG_MAX, 0 );
FOR(quic_tile_cnt) fd_topob_tile( topo, "quic", "quic", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
FOR(verify_tile_cnt) fd_topob_tile( topo, "verify", "verify", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
/**/ fd_topob_tile( topo, "dedup", "dedup", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0 );
@@ -157,6 +162,13 @@ fd_topo_initialize( config_t * config ) {
}
}
+ /* The netlink tile shares various objects with net tiles */
+ fd_netlink_topo_create( netlink_tile, topo, config );
+ for( ulong i=0UL; itiles[ net_tile_id ] );
+ }
+
/* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */
FOR(net_tile_cnt) for( ulong j=0UL; jnet.quic_transaction_listen_port = config->tiles.quic.quic_transaction_listen_port;
tile->net.legacy_transaction_listen_port = config->tiles.quic.regular_transaction_listen_port;
- /* multihome support */
- ulong multi_cnt = tile->net.multihome_ip_addrs_cnt = config->tiles.net.multihome_ip_addrs_cnt;
- for( ulong j = 0; j < multi_cnt; ++j ) {
- tile->net.multihome_ip_addrs[j] = config->tiles.net.multihome_ip4_addrs[j];
- }
+ } else if( FD_UNLIKELY( !strcmp( tile->name, "netlnk" ) ) ) {
+
+ /* already configured */
+
} else if( FD_UNLIKELY( !strcmp( tile->name, "quic" ) ) ) {
fd_memcpy( tile->quic.src_mac_addr, config->tiles.net.mac_addr, 6 );
diff --git a/src/app/fddev/main1.c b/src/app/fddev/main1.c
index b6befd47c9..4a537eb304 100644
--- a/src/app/fddev/main1.c
+++ b/src/app/fddev/main1.c
@@ -34,6 +34,7 @@ configure_stage_t * STAGES[ CONFIGURE_STAGE_COUNT ] = {
};
extern fd_topo_run_tile_t fd_tile_net;
+extern fd_topo_run_tile_t fd_tile_netlink;
extern fd_topo_run_tile_t fd_tile_quic;
extern fd_topo_run_tile_t fd_tile_verify;
extern fd_topo_run_tile_t fd_tile_dedup;
@@ -69,6 +70,7 @@ extern fd_topo_run_tile_t fd_tile_rpcserv;
fd_topo_run_tile_t * TILES[] = {
&fd_tile_net,
+ &fd_tile_netlink,
&fd_tile_quic,
&fd_tile_verify,
&fd_tile_dedup,
@@ -187,13 +189,13 @@ fddev_main( int argc,
}
action_t * action = NULL;
- for( ulong i=0; i int:
return 8
-
+
def count(self) -> int:
return 1
@@ -85,7 +86,7 @@ def __init__(self, name: str, tile: Optional[Tile], description: str, clickhouse
def footprint(self) -> int:
return 8 * len(self.enum.values)
-
+
def count(self) -> int:
return len(self.enum.values)
@@ -97,7 +98,7 @@ def __init__(self, name: str, tile: Optional[Tile], description: str, clickhouse
def footprint(self) -> int:
return 8 * len(self.enum.values)
-
+
def count(self) -> int:
return len(self.enum.values)
@@ -140,7 +141,7 @@ def layout(self):
def parse_metric(tile: Optional[Tile], metric: ET.Element, enums: Dict[str, MetricEnum]) -> Metric:
name = metric.attrib['name']
description = ""
-
+
summary_ele = metric.find('summary')
if summary_ele is not None and summary_ele.text is not None:
description = summary_ele.text
@@ -201,7 +202,7 @@ def parse_metrics(xml_data: str) -> Metrics:
Tile[tile.attrib['name'].upper()]: [
parse_metric(Tile[tile.attrib['name'].upper()], metric, enums)
for metric in tile
- ]
+ ]
for tile in root.findall('tile')
}
@@ -212,5 +213,5 @@ def parse_metrics(xml_data: str) -> Metrics:
link_out = root.find('linkout')
assert link_out is not None
link_out = [parse_metric(None, metric, enums) for metric in link_out]
-
+
return Metrics(common=common, tiles=tiles, link_in=link_in, link_out=link_out, enums=enums)
\ No newline at end of file
diff --git a/src/disco/metrics/generated/Local.mk b/src/disco/metrics/generated/Local.mk
index b497a7ac80..e3382537fc 100644
--- a/src/disco/metrics/generated/Local.mk
+++ b/src/disco/metrics/generated/Local.mk
@@ -1,2 +1,2 @@
-$(call add-hdrs,fd_metrics_all.h fd_metrics_quic.h)
-$(call add-objs,fd_metrics_all fd_metrics_net fd_metrics_quic fd_metrics_verify fd_metrics_dedup fd_metrics_resolv fd_metrics_pack fd_metrics_bank fd_metrics_poh fd_metrics_store fd_metrics_shred fd_metrics_replay fd_metrics_storei fd_metrics_gossip,fd_disco)
+$(call add-hdrs,$(notdir $(wildcard $(MKPATH)/*.h)))
+$(call add-objs,$(patsubst %.c,%,$(notdir $(wildcard $(MKPATH)/*.c))),fd_disco)
diff --git a/src/disco/metrics/generated/fd_metrics_all.c b/src/disco/metrics/generated/fd_metrics_all.c
index bc01aeb5d2..e6f4da236e 100644
--- a/src/disco/metrics/generated/fd_metrics_all.c
+++ b/src/disco/metrics/generated/fd_metrics_all.c
@@ -49,6 +49,7 @@ const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT] = {
"replay",
"storei",
"gossip",
+ "netlnk",
};
const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = {
@@ -65,6 +66,7 @@ const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT] = {
FD_METRICS_REPLAY_TOTAL,
FD_METRICS_STOREI_TOTAL,
FD_METRICS_GOSSIP_TOTAL,
+ FD_METRICS_NETLNK_TOTAL,
};
const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT] = {
FD_METRICS_NET,
@@ -80,4 +82,5 @@ const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT]
FD_METRICS_REPLAY,
FD_METRICS_STOREI,
FD_METRICS_GOSSIP,
+ FD_METRICS_NETLNK,
};
diff --git a/src/disco/metrics/generated/fd_metrics_all.h b/src/disco/metrics/generated/fd_metrics_all.h
index 6a56847481..5cab0b3bd0 100644
--- a/src/disco/metrics/generated/fd_metrics_all.h
+++ b/src/disco/metrics/generated/fd_metrics_all.h
@@ -15,6 +15,7 @@
#include "fd_metrics_replay.h"
#include "fd_metrics_storei.h"
#include "fd_metrics_gossip.h"
+#include "fd_metrics_netlnk.h"
/* Start of LINK OUT metrics */
#define FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF (0UL)
@@ -151,7 +152,7 @@ extern const fd_metrics_meta_t FD_METRICS_ALL_LINK_OUT[FD_METRICS_ALL_LINK_OUT_T
#define FD_METRICS_TOTAL_SZ (8UL*222UL)
-#define FD_METRICS_TILE_KIND_CNT 13
+#define FD_METRICS_TILE_KIND_CNT 14
extern const char * FD_METRICS_TILE_KIND_NAMES[FD_METRICS_TILE_KIND_CNT];
extern const ulong FD_METRICS_TILE_KIND_SIZES[FD_METRICS_TILE_KIND_CNT];
extern const fd_metrics_meta_t * FD_METRICS_TILE_KIND_METRICS[FD_METRICS_TILE_KIND_CNT];
diff --git a/src/disco/metrics/generated/fd_metrics_enums.h b/src/disco/metrics/generated/fd_metrics_enums.h
index 48d935e817..b2ca8f9264 100644
--- a/src/disco/metrics/generated/fd_metrics_enums.h
+++ b/src/disco/metrics/generated/fd_metrics_enums.h
@@ -466,3 +466,17 @@
#define FD_METRICS_ENUM_MAKE_PRUNE_EVENT_V_ENCODING_FAILED_IDX 2
#define FD_METRICS_ENUM_MAKE_PRUNE_EVENT_V_ENCODING_FAILED_NAME "encoding_failed"
+#define FD_METRICS_ENUM_NETLINK_MSG_NAME "netlink_msg"
+#define FD_METRICS_ENUM_NETLINK_MSG_V_LINK_IDX 0
+#define FD_METRICS_ENUM_NETLINK_MSG_V_LINK_NAME "link"
+#define FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_IDX 1
+#define FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_NAME "neigh"
+#define FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_IDX 2
+#define FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_NAME "ipv4_route"
+
+#define FD_METRICS_ENUM_ROUTE_TABLE_NAME "route_table"
+#define FD_METRICS_ENUM_ROUTE_TABLE_V_LOCAL_IDX 0
+#define FD_METRICS_ENUM_ROUTE_TABLE_V_LOCAL_NAME "local"
+#define FD_METRICS_ENUM_ROUTE_TABLE_V_MAIN_IDX 1
+#define FD_METRICS_ENUM_ROUTE_TABLE_V_MAIN_NAME "main"
+
diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.c b/src/disco/metrics/generated/fd_metrics_netlnk.c
new file mode 100644
index 0000000000..a401144e9f
--- /dev/null
+++ b/src/disco/metrics/generated/fd_metrics_netlnk.c
@@ -0,0 +1,16 @@
+/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. */
+#include "fd_metrics_netlnk.h"
+
+const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL] = {
+ DECLARE_METRIC( NETLNK_DROP_EVENTS, COUNTER ),
+ DECLARE_METRIC( NETLNK_LINK_FULL_SYNCS, COUNTER ),
+ DECLARE_METRIC( NETLNK_ROUTE_FULL_SYNCS, COUNTER ),
+ DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, LINK ),
+ DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, NEIGH ),
+ DECLARE_METRIC_ENUM( NETLNK_UPDATES, COUNTER, NETLINK_MSG, IPV4_ROUTE ),
+ DECLARE_METRIC( NETLNK_INTERFACE_COUNT, GAUGE ),
+ DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, LOCAL ),
+ DECLARE_METRIC_ENUM( NETLNK_ROUTE_COUNT, GAUGE, ROUTE_TABLE, MAIN ),
+ DECLARE_METRIC( NETLNK_NEIGHBOR_SOLICITS_SENT, COUNTER ),
+ DECLARE_METRIC( NETLNK_NEIGHBOR_SOLICITS_FAILS, COUNTER ),
+};
diff --git a/src/disco/metrics/generated/fd_metrics_netlnk.h b/src/disco/metrics/generated/fd_metrics_netlnk.h
new file mode 100644
index 0000000000..6eda56d6f1
--- /dev/null
+++ b/src/disco/metrics/generated/fd_metrics_netlnk.h
@@ -0,0 +1,64 @@
+/* THIS FILE IS GENERATED BY gen_metrics.py. DO NOT HAND EDIT. */
+
+#include "../fd_metrics_base.h"
+#include "fd_metrics_enums.h"
+
+#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_OFF (16UL)
+#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_NAME "netlnk_drop_events"
+#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_DESC "Number of netlink drop events caught"
+#define FD_METRICS_COUNTER_NETLNK_DROP_EVENTS_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_OFF (17UL)
+#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_NAME "netlnk_link_full_syncs"
+#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_DESC "Number of full link table syncs done"
+#define FD_METRICS_COUNTER_NETLNK_LINK_FULL_SYNCS_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_OFF (18UL)
+#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_NAME "netlnk_route_full_syncs"
+#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_DESC "Number of full route table syncs done"
+#define FD_METRICS_COUNTER_NETLNK_ROUTE_FULL_SYNCS_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_OFF (19UL)
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_NAME "netlnk_updates"
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_DESC "Number of netlink live updates processed"
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_CVT (FD_METRICS_CONVERTER_NONE)
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_CNT (3UL)
+
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_LINK_OFF (19UL)
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_NEIGH_OFF (20UL)
+#define FD_METRICS_COUNTER_NETLNK_UPDATES_IPV4_ROUTE_OFF (21UL)
+
+#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_OFF (22UL)
+#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_NAME "netlnk_interface_count"
+#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_TYPE (FD_METRICS_TYPE_GAUGE)
+#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_DESC "Number of network interfaces"
+#define FD_METRICS_GAUGE_NETLNK_INTERFACE_COUNT_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_OFF (23UL)
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_NAME "netlnk_route_count"
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_TYPE (FD_METRICS_TYPE_GAUGE)
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_DESC "Number of IPv4 routes"
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_CVT (FD_METRICS_CONVERTER_NONE)
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_CNT (2UL)
+
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_LOCAL_OFF (23UL)
+#define FD_METRICS_GAUGE_NETLNK_ROUTE_COUNT_MAIN_OFF (24UL)
+
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_OFF (25UL)
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_NAME "netlnk_neighbor_solicits_sent"
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_DESC "Number of neighbor solicit requests sent to kernel"
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_SENT_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_OFF (26UL)
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_NAME "netlnk_neighbor_solicits_fails"
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_TYPE (FD_METRICS_TYPE_COUNTER)
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_DESC "Number of neighbor solicit requests that failed to send"
+#define FD_METRICS_COUNTER_NETLNK_NEIGHBOR_SOLICITS_FAILS_CVT (FD_METRICS_CONVERTER_NONE)
+
+#define FD_METRICS_NETLNK_TOTAL (11UL)
+extern const fd_metrics_meta_t FD_METRICS_NETLNK[FD_METRICS_NETLNK_TOTAL];
diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml
index ac51fe12c7..6c47172f41 100644
--- a/src/disco/metrics/metrics.xml
+++ b/src/disco/metrics/metrics.xml
@@ -588,4 +588,28 @@ metric introduced.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/disco/netlink/Local.mk b/src/disco/netlink/Local.mk
new file mode 100644
index 0000000000..c1e78d7145
--- /dev/null
+++ b/src/disco/netlink/Local.mk
@@ -0,0 +1,6 @@
+ifdef FD_HAS_LINUX
+ifdef FD_HAS_SSE
+$(call add-hdrs,fd_netlink_tile.h)
+$(call add-objs,fd_netlink_tile,fd_disco)
+endif
+endif
diff --git a/src/disco/netlink/fd_netlink_tile.c b/src/disco/netlink/fd_netlink_tile.c
new file mode 100644
index 0000000000..c8acbe6f16
--- /dev/null
+++ b/src/disco/netlink/fd_netlink_tile.c
@@ -0,0 +1,383 @@
+#include "fd_netlink_tile_private.h"
+#include "../topo/fd_topo.h"
+#include "../topo/fd_topob.h"
+#include "../topo/fd_pod_format.h"
+#include "generated/netlink_seccomp.h"
+#include "../metrics/fd_metrics.h"
+#include "../../waltz/ip/fd_fib4_netlink.h"
+#include "../../waltz/mib/fd_netdev_netlink.h"
+#include "../../waltz/neigh/fd_neigh4_netlink.h"
+#include "../../app/fdctl/config.h" /* FIXME inverse dependency */
+#include "../../util/log/fd_dtrace.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <sys/socket.h> /* SOL_{...} */
+#include <sys/random.h> /* getrandom */
+#include <sys/time.h> /* struct timeval */
+#include <linux/rtnetlink.h> /* RTM_{...} */
+
+/* Hardcoded limits */
+#define NETDEV_MAX (256U)
+#define BOND_MASTER_MAX (256U)
+
+/* fd_netlink_topo_create registers the objects used by the netlink
+ tile with topo: a "dbl_buf" for the netdev table, two "fib4" route
+ tables (main and local), a "neigh4_hmap" neighbor table and an
+ "opaque" object sized to hold the neighbor table's elements. All
+ five are created with the "netbase" tag (presumably the workspace
+ name -- confirm against fd_topob_obj) and are attached to
+ netlink_tile in read-write mode. Sizing parameters derived from
+ config are stored in topo->props, and the object IDs plus the
+ interface index of config->tiles.net.interface are recorded in
+ netlink_tile->netlink. An unresolvable interface name is reported
+ via FD_LOG_ERR. */
+void
+fd_netlink_topo_create( fd_topo_tile_t * netlink_tile,
+ fd_topo_t * topo,
+ struct fdctl_config const * config ) {
+ fd_topo_obj_t * netdev_dbl_buf_obj = fd_topob_obj( topo, "dbl_buf", "netbase" );
+ fd_topo_obj_t * fib4_main_obj = fd_topob_obj( topo, "fib4", "netbase" );
+ fd_topo_obj_t * fib4_local_obj = fd_topob_obj( topo, "fib4", "netbase" );
+ fd_topo_obj_t * neigh4_obj = fd_topob_obj( topo, "neigh4_hmap", "netbase" );
+ fd_topo_obj_t * neigh4_ele_obj = fd_topob_obj( topo, "opaque", "netbase" );
+
+ fd_topob_tile_uses( topo, netlink_tile, netdev_dbl_buf_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, netlink_tile, fib4_main_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, netlink_tile, fib4_local_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, netlink_tile, neigh4_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+ fd_topob_tile_uses( topo, netlink_tile, neigh4_ele_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
+
+ /* Configure double buffer of netdev table: its "mtu" property is the
+ footprint of a netdev table with the hardcoded limits above */
+ ulong const netdev_dbl_buf_mtu = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, netdev_dbl_buf_mtu, "obj.%lu.mtu", netdev_dbl_buf_obj->id ) );
+
+ /* Configure route tables: both share the same max_routes setting */
+ FD_TEST( fd_pod_insertf_ulong( topo->props, config->tiles.netlink.max_routes, "obj.%lu.route_max", fib4_main_obj->id ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, config->tiles.netlink.max_routes, "obj.%lu.route_max", fib4_local_obj->id ) );
+
+ /* Configure neighbor hashmap: Open addressed hashmap with 3.0 sparsity
+ factor and 16 long probe chain. The element count is
+ 3*max_neighbors rounded up to a power of two, and the element
+ storage object is sized to hold exactly that many entries. */
+ uint const neigh_if_idx = if_nametoindex( config->tiles.net.interface );
+ if( FD_UNLIKELY( !neigh_if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed (%i-%s)", config->tiles.net.interface, errno, fd_io_strerror( errno ) ));
+ ulong const neigh_ele_max = fd_ulong_pow2_up( 3UL * config->tiles.netlink.max_neighbors );
+ ulong const neigh_ele_align = alignof(fd_neigh4_entry_t);
+ ulong const neigh_ele_fp = neigh_ele_max * sizeof(fd_neigh4_entry_t);
+ FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_max, "obj.%lu.ele_max", neigh4_obj->id ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, 16UL, "obj.%lu.probe_max", neigh4_obj->id ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, 4UL, "obj.%lu.lock_cnt", neigh4_obj->id ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_align, "obj.%lu.align", neigh4_ele_obj->id ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, neigh_ele_fp, "obj.%lu.footprint", neigh4_ele_obj->id ) );
+
+ /* Pick a random hashmap seed (8 bytes from getrandom; a short read
+ fails the FD_TEST) */
+ ulong seed;
+ FD_TEST( 8UL==getrandom( &seed, sizeof(ulong), 0 ) );
+ FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", neigh4_obj->id ) );
+
+ /* Remember which objects belong to this tile so that the tile and
+ fd_netlink_topo_join can find them again */
+ netlink_tile->netlink.netdev_dbl_buf_obj_id = netdev_dbl_buf_obj->id;
+ netlink_tile->netlink.fib4_main_obj_id = fib4_main_obj->id;
+ netlink_tile->netlink.fib4_local_obj_id = fib4_local_obj->id;
+ netlink_tile->netlink.neigh_if_idx = neigh_if_idx;
+ netlink_tile->netlink.neigh4_obj_id = neigh4_obj->id;
+ netlink_tile->netlink.neigh4_ele_obj_id = neigh4_ele_obj->id;
+}
+
+/* fd_netlink_topo_join gives join_tile read-only access to the tables
+   maintained by netlink_tile: the neighbor hashmap, its element
+   storage, and the main and local IPv4 route tables (attached in that
+   order). */
+void
+fd_netlink_topo_join( fd_topo_t *      topo,
+                      fd_topo_tile_t * netlink_tile,
+                      fd_topo_tile_t * join_tile ) {
+  ulong const shared_obj_ids[ 4 ] = {
+    netlink_tile->netlink.neigh4_obj_id,
+    netlink_tile->netlink.neigh4_ele_obj_id,
+    netlink_tile->netlink.fib4_main_obj_id,
+    netlink_tile->netlink.fib4_local_obj_id
+  };
+  for( ulong j=0UL; j<4UL; j++ ) {
+    fd_topo_obj_t * shared_obj = &topo->objs[ shared_obj_ids[ j ] ];
+    fd_topob_tile_uses( topo, join_tile, shared_obj, FD_SHMEM_JOIN_MODE_READ_ONLY );
+  }
+}
+
+/* Timing details:
+
+ Housekeeping is done every 97ms.
+ Socket receives block up to 43ms. */
+
+/* Begin tile methods */
+
+/* scratch_align returns the alignment of the tile's scratch region,
+   which is the larger of the tile context alignment and the netdev
+   table alignment. */
+FD_FN_CONST static inline ulong
+scratch_align( void ) {
+  ulong const ctx_align = alignof(fd_netlink_tile_ctx_t);
+  ulong const tbl_align = FD_NETDEV_TBL_ALIGN;
+  return fd_ulong_max( ctx_align, tbl_align );
+}
+
+/* scratch_footprint returns the size of the tile's scratch region: the
+   tile context followed by a local netdev table dimensioned with
+   NETDEV_MAX and BOND_MASTER_MAX.  tile is unused. */
+FD_FN_PURE static inline ulong
+scratch_footprint( fd_topo_tile_t const * tile ) {
+  (void)tile;
+  ulong const tbl_footprint = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX );
+  ulong layout = FD_LAYOUT_APPEND( FD_LAYOUT_INIT, alignof(fd_netlink_tile_ctx_t), sizeof(fd_netlink_tile_ctx_t) );
+  layout       = FD_LAYOUT_APPEND( layout,         fd_netdev_tbl_align(),           tbl_footprint                 );
+  return FD_LAYOUT_FINI( layout, scratch_align() );
+}
+
+/* populate_allowed_seccomp writes the netlink tile's seccomp filter
+   into out (capacity out_cnt instructions) and returns the number of
+   instructions in that filter.  The filter is parameterized with the
+   log file descriptor and the two rtnetlink socket descriptors held in
+   the tile context. */
+static ulong
+populate_allowed_seccomp( fd_topo_t const *      topo,
+                          fd_topo_tile_t const * tile,
+                          ulong                  out_cnt,
+                          struct sock_filter *   out ) {
+  fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
+  FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC );
+
+  uint const logfile_fd = (uint)fd_log_private_logfile_fd();
+  uint const nl_mon_fd  = (uint)ctx->nl_monitor->fd;
+  uint const nl_req_fd  = (uint)ctx->nl_req->fd;
+
+  populate_sock_filter_policy_netlink( out_cnt, out, logfile_fd, nl_mon_fd, nl_req_fd );
+  return sock_filter_policy_netlink_instr_cnt;
+}
+
+/* populate_allowed_fds lists the file descriptors the tile keeps open:
+   stderr, the log file (only if one is open), and the two rtnetlink
+   sockets.  out_fds must have room for at least 4 entries.  Returns
+   the number of entries written. */
+static ulong
+populate_allowed_fds( fd_topo_t const *      topo,
+                      fd_topo_tile_t const * tile,
+                      ulong                  out_fds_cnt,
+                      int *                  out_fds ) {
+  fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
+  FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC );
+
+  if( FD_UNLIKELY( out_fds_cnt<4UL ) ) FD_LOG_ERR(( "out_fds_cnt too low (%lu)", out_fds_cnt ));
+
+  int const logfile_fd = fd_log_private_logfile_fd();
+
+  ulong n = 0UL;
+  out_fds[ n ] = 2; /* stderr */
+  n++;
+  if( FD_LIKELY( logfile_fd!=-1 ) ) {
+    out_fds[ n ] = logfile_fd; /* logfile */
+    n++;
+  }
+  out_fds[ n ] = ctx->nl_monitor->fd;
+  n++;
+  out_fds[ n ] = ctx->nl_req->fd;
+  n++;
+  return n;
+}
+
+/* privileged_init is the tile's .privileged_init hook.  It resets the
+   tile context, opens the two rtnetlink sockets (nl_monitor for update
+   notifications, nl_req for request-reply traffic), subscribes
+   nl_monitor to link, neighbor and IPv4 route notifications, and sets
+   a 43ms receive timeout on nl_monitor so that polling it from the
+   event loop returns even when no notification arrives.  Only a
+   single netlink tile (kind_id 0) is supported.  Failures are
+   reported with FD_LOG_ERR. */
+
+static void
+privileged_init( fd_topo_t *      topo,
+                 fd_topo_tile_t * tile ) {
+  if( FD_UNLIKELY( tile->kind_id!=0 ) ) {
+    FD_LOG_ERR(( "Topology contains more than one netlink tile" ));
+  }
+
+  fd_netlink_tile_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
+
+  /* Start from a known state.  The pending action mask, the rate limit
+     timestamps and the metric counters are OR-ed into, compared or
+     incremented later without ever being assigned an initial value,
+     so they must not inherit whatever the scratch memory contained. */
+  memset( ctx, 0, sizeof(fd_netlink_tile_ctx_t) );
+  ctx->magic        = FD_NETLINK_TILE_CTX_MAGIC;
+  ctx->neigh4_ifidx = tile->netlink.neigh_if_idx;
+
+  /* NOTE(review): the second argument of fd_netlink_init differs per
+     socket (1000 vs 9000000); presumably an initial sequence number --
+     confirm against fd_netlink_init. */
+  if( FD_UNLIKELY( !fd_netlink_init( ctx->nl_monitor, 1000U ) ) ) {
+    FD_LOG_ERR(( "Failed to connect to rtnetlink" ));
+  }
+  if( FD_UNLIKELY( !fd_netlink_init( ctx->nl_req, 9000000U ) ) ) {
+    FD_LOG_ERR(( "Failed to connect to rtnetlink" ));
+  }
+
+  /* Join the multicast groups that carry the notifications handled in
+     before_credit.  The union avoids casting between sockaddr types. */
+  union {
+    struct sockaddr    sa;
+    struct sockaddr_nl sanl;
+  } sa;
+  sa.sanl = (struct sockaddr_nl) {
+    .nl_family = AF_NETLINK,
+    .nl_groups = RTMGRP_LINK | RTMGRP_NEIGH | RTMGRP_IPV4_ROUTE
+  };
+  if( FD_UNLIKELY( 0!=bind( ctx->nl_monitor->fd, &sa.sa, sizeof(struct sockaddr_nl) ) ) ) {
+    FD_LOG_ERR(( "bind(sock,RT_NETLINK,RTMGRP_{LINK,NEIGH,IPV4_ROUTE}) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
+  }
+
+  struct timeval tv = { .tv_usec = 43000, }; /* 43ms */
+  if( FD_UNLIKELY( 0!=setsockopt( ctx->nl_monitor->fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval) ) ) ) {
+    FD_LOG_ERR(( "setsockopt(sock,SOL_SOCKET,SO_RCVTIMEO) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
+  }
+}
+
+/* unprivileged_init is the tile's .unprivileged_init hook.  It carves
+   the local netdev table out of the scratch region behind the tile
+   context, joins the shared objects recorded in tile->netlink (netdev
+   double buffer, neighbor hashmap and both route tables), checks that
+   every input link has an MTU of zero, and schedules an initial
+   refresh of the link, route and neighbor tables.  Expects the
+   context to have been set up by privileged_init (magic check). */
+
+static void
+unprivileged_init( fd_topo_t *      topo,
+                   fd_topo_tile_t * tile ) {
+  FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) );
+  fd_netlink_tile_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netlink_tile_ctx_t), sizeof(fd_netlink_tile_ctx_t) );
+  FD_TEST( ctx->magic==FD_NETLINK_TILE_CTX_MAGIC );
+  ctx->netdev_sz    = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX );
+  ctx->netdev_local = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_sz );
+
+  /* An object ID of zero is treated as "not configured" */
+  FD_TEST( tile->netlink.netdev_dbl_buf_obj_id );
+  FD_TEST( tile->netlink.neigh4_obj_id );
+  FD_TEST( tile->netlink.neigh4_ele_obj_id );
+  FD_TEST( tile->netlink.fib4_local_obj_id );
+  FD_TEST( tile->netlink.fib4_main_obj_id );
+
+  /* Local mutable netdev table and the shared buffer it is copied to */
+  FD_TEST( fd_netdev_tbl_new( ctx->netdev_local, NETDEV_MAX, BOND_MASTER_MAX ) );
+  FD_TEST( fd_netdev_tbl_join( ctx->netdev_tbl, ctx->netdev_local ) );
+
+  FD_TEST( ctx->netdev_buf = fd_dbl_buf_join( fd_topo_obj_laddr( topo, tile->netlink.netdev_dbl_buf_obj_id ) ) );
+
+  FD_TEST( fd_neigh4_hmap_join( ctx->neigh4, fd_topo_obj_laddr( topo, tile->netlink.neigh4_obj_id ), fd_topo_obj_laddr( topo, tile->netlink.neigh4_ele_obj_id ) ) );
+  ctx->fib4_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_local_obj_id ) ); FD_TEST( ctx->fib4_local );
+  ctx->fib4_main  = fd_fib4_join( fd_topo_obj_laddr( topo, tile->netlink.fib4_main_obj_id  ) ); FD_TEST( ctx->fib4_main  );
+
+  /* Solicit requests are carried entirely in the frag's sig field (see
+     after_frag), so input links must not carry a payload. */
+  for( ulong i=0UL; i<tile->in_cnt; i++ ) {
+    fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];
+    if( FD_UNLIKELY( link->mtu!=0UL ) ) FD_LOG_ERR(( "netlink solicit links must have an MTU of zero" ));
+  }
+
+  /* Request a full sync of every table on the first housekeeping pass */
+  ctx->action |= FD_NET_TILE_ACTION_LINK_UPDATE;
+  ctx->action |= FD_NET_TILE_ACTION_ROUTE4_UPDATE;
+  ctx->action |= FD_NET_TILE_ACTION_NEIGH_UPDATE;
+
+  ctx->update_backoff = (long)( fd_tempo_tick_per_ns( NULL ) * 10e6 ); /* 10ms */
+}
+
+/* Begin stem methods
+
+ Note: Using stem here might seem odd since fd_netlink_tile publishes
+ no messages and only consumes payload-free neighbor solicit requests
+ (see after_frag). Use of stem is justified by the initialization,
+ generic metrics, and event loop functionality it provides. */
+
+/* metrics_write copies the tile's counters and table sizes into the
+ NETLNK metrics. Drop events come from fd_netlink_enobufs_cnt (not
+ from ctx); the interface and route gauges are read from the live
+ tables at the time of the call; everything else is taken from
+ ctx->metrics. */
+static inline void
+metrics_write( fd_netlink_tile_ctx_t * ctx ) {
+ FD_MCNT_SET( NETLNK, DROP_EVENTS, fd_netlink_enobufs_cnt );
+ FD_MCNT_SET( NETLNK, LINK_FULL_SYNCS, ctx->metrics.link_full_syncs );
+ FD_MCNT_SET( NETLNK, ROUTE_FULL_SYNCS, ctx->metrics.route_full_syncs );
+ /* update_cnt is indexed by FD_METRICS_ENUM_NETLINK_MSG_V_*_IDX */
+ FD_MCNT_ENUM_COPY( NETLNK, UPDATES, ctx->metrics.update_cnt );
+ FD_MGAUGE_SET( NETLNK, INTERFACE_COUNT, ctx->netdev_tbl->hdr->dev_cnt );
+ FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_LOCAL, fd_fib4_cnt( ctx->fib4_local ) );
+ FD_MGAUGE_SET( NETLNK, ROUTE_COUNT_MAIN, fd_fib4_cnt( ctx->fib4_main ) );
+ FD_MCNT_SET( NETLNK, NEIGHBOR_SOLICITS_SENT, ctx->metrics.neigh_solicits_sent );
+ FD_MCNT_SET( NETLNK, NEIGHBOR_SOLICITS_FAILS, ctx->metrics.neigh_solicits_fails );
+}
+
+/* during_housekeeping carries out the table refreshes flagged in
+ ctx->action. Link and route refreshes are rate limited: after a
+ refresh, the next one of the same kind is not started before
+ ctx->update_backoff ticks have passed. The neighbor refresh has no
+ such limit. Each refresh clears its action bit before running.
+
+ NOTE(review): while a link (or route) refresh is being held back the
+ function returns early, which also postpones any later pending work
+ in this function to a subsequent housekeeping pass -- confirm this
+ is intended. */
+static inline void
+during_housekeeping( fd_netlink_tile_ctx_t * ctx ) {
+ long now = fd_tickcount();
+ if( ctx->action & FD_NET_TILE_ACTION_LINK_UPDATE ) {
+ if( now < ctx->link_update_ts ) return;
+ ctx->action &= ~FD_NET_TILE_ACTION_LINK_UPDATE;
+ /* Reload the local table, then copy all netdev_sz bytes of it into
+ the shared double buffer */
+ fd_netdev_netlink_load_table( ctx->netdev_tbl, ctx->nl_req );
+ fd_dbl_buf_insert( ctx->netdev_buf, ctx->netdev_local, ctx->netdev_sz );
+ ctx->link_update_ts = now+ctx->update_backoff;
+ ctx->metrics.link_full_syncs++;
+ }
+ if( ctx->action & FD_NET_TILE_ACTION_ROUTE4_UPDATE ) {
+ if( now < ctx->route4_update_ts ) return;
+ ctx->action &= ~FD_NET_TILE_ACTION_ROUTE4_UPDATE;
+ /* Both route tables are reloaded together and counted as one sync */
+ fd_fib4_netlink_load_table( ctx->fib4_local, ctx->nl_req, RT_TABLE_LOCAL );
+ fd_fib4_netlink_load_table( ctx->fib4_main, ctx->nl_req, RT_TABLE_MAIN );
+ ctx->route4_update_ts = now+ctx->update_backoff;
+ ctx->metrics.route_full_syncs++;
+ }
+ if( ctx->action & FD_NET_TILE_ACTION_NEIGH_UPDATE ) {
+ ctx->action &= ~FD_NET_TILE_ACTION_NEIGH_UPDATE;
+ /* Request a neighbor dump for the configured interface and feed
+ every message of the reply into the neighbor table */
+ fd_neigh4_netlink_request_dump( ctx->nl_req, ctx->neigh4_ifidx );
+ uchar buf[ 4096 ];
+ fd_netlink_iter_t iter[1];
+ for( fd_netlink_iter_init( iter, ctx->nl_req, buf, sizeof(buf) );
+ !fd_netlink_iter_done( iter );
+ fd_netlink_iter_next( iter, ctx->nl_req ) ) {
+ fd_neigh4_netlink_ingest_message( ctx->neigh4, fd_netlink_iter_msg( iter ), ctx->neigh4_ifidx );
+ }
+ }
+}
+
+/* before_credit polls the monitor socket once per call and reacts to
+   the notification it receives, if any.  Link and route notifications
+   only flag a table refresh for the next housekeeping pass; neighbor
+   notifications are applied to the neighbor table right away.  The
+   per-type counters in ctx->metrics.update_cnt are advanced
+   accordingly.  *charge_busy is set to 1 whenever the socket returned
+   data.  stem is unused.
+
+   The receive may wait for up to the socket's receive timeout (set in
+   privileged_init); a timeout or error makes the call a no-op. */
+
+static inline void
+before_credit( fd_netlink_tile_ctx_t * ctx,
+               fd_stem_context_t *     stem,
+               int *                   charge_busy ) {
+  (void)stem;
+
+  uchar msg[ 16384 ];
+  long msg_sz = recvfrom( ctx->nl_monitor->fd, msg, sizeof(msg), 0, NULL, NULL );
+  if( msg_sz<=0L ) return;
+
+  /* FIXME the reported busy% should not include any wait time */
+  *charge_busy = 1;
+
+  /* msg is not initialized, so header fields may only be interpreted
+     if the kernel actually delivered a complete netlink header. */
+  if( FD_UNLIKELY( (ulong)msg_sz<sizeof(struct nlmsghdr) ) ) return;
+
+  struct nlmsghdr * nlh = fd_type_pun( msg );
+  FD_DTRACE_PROBE_4( netlink_update, nlh->nlmsg_seq, nlh->nlmsg_type, nlh->nlmsg_len, nlh->nlmsg_flags );
+  switch( nlh->nlmsg_type ) {
+  case RTM_NEWLINK:
+  case RTM_DELLINK:
+    /* Link changes are not applied incrementally; schedule a reload */
+    ctx->action |= FD_NET_TILE_ACTION_LINK_UPDATE;
+    ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_LINK_IDX ]++;
+    break;
+  case RTM_NEWROUTE:
+  case RTM_DELROUTE:
+    /* Same for route changes */
+    ctx->action |= FD_NET_TILE_ACTION_ROUTE4_UPDATE;
+    ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_IPV4_ROUTE_IDX ]++;
+    break;
+  case RTM_NEWNEIGH:
+  case RTM_DELNEIGH: {
+    fd_neigh4_netlink_ingest_message( ctx->neigh4, nlh, ctx->neigh4_ifidx );
+    ctx->metrics.update_cnt[ FD_METRICS_ENUM_NETLINK_MSG_V_NEIGH_IDX ]++;
+    break;
+  }
+  default:
+    FD_LOG_INFO(( "Received unexpected netlink message type %u", nlh->nlmsg_type ));
+    break;
+  }
+}
+
+/* after_frag handles a neighbor solicit request.
+
+ The request is encoded in sig: bits 0-31 hold the IPv4 address
+ (passed through unchanged to the neighbor table and to the solicit
+ call), bits 32-47 hold the interface index, and bits 48-63 are
+ expected to be zero. sz is expected to be zero. Violations of the
+ latter two expectations are logged but do not abort processing.
+ in_idx, seq, tsorig and stem are unused.
+
+ Requests for an interface other than ctx->neigh4_ifidx, requests
+ for which no table slot could be prepared, and requests whose
+ netlink solicit call fails are counted in neigh_solicits_fails.
+ Requests for an address already present in the neighbor table are
+ dropped without being counted.
+
+ NOTE(review): the fd_netlink_neigh4_solicit{,_sse} helpers in
+ fd_netlink_tile.h set sig to the bare IPv4 address, i.e. interface
+ index 0, while fd_netlink_topo_create rejects a configured index of
+ 0. Requests published through those helpers would therefore appear
+ to always take the interface mismatch branch -- confirm. */
+
+static void
+after_frag( fd_netlink_tile_ctx_t * ctx,
+ ulong in_idx,
+ ulong seq,
+ ulong sig,
+ ulong sz,
+ ulong tsorig,
+ fd_stem_context_t * stem ) {
+ (void)in_idx; (void)seq; (void)tsorig; (void)stem;
+
+ /* Parse request (fully contained in sig field) */
+
+ if( FD_UNLIKELY( sz!=0UL ) ) {
+ FD_LOG_WARNING(( "unexpected sz %lu", sz ));
+ }
+ if( FD_UNLIKELY( sig>>48 ) ) {
+ FD_LOG_WARNING(( "unexpected high bits in sig %016lx", sig ));
+ }
+ ushort if_idx = (ushort)(sig>>32);
+ uint ip4_addr = (uint)sig;
+ if( FD_UNLIKELY( if_idx!=ctx->neigh4_ifidx ) ) {
+ ctx->metrics.neigh_solicits_fails++;
+ return;
+ }
+
+ /* Drop if the kernel is already working on the request (any existing
+ entry for this address, regardless of its state, ends processing) */
+
+ fd_neigh4_hmap_query_t query[1];
+ int spec_res = fd_neigh4_hmap_query_try( ctx->neigh4, &ip4_addr, NULL, query, 0 );
+ if( spec_res==FD_MAP_SUCCESS ) return;
+
+ /* Insert placeholder (take above branch next time) */
+
+ int prepare_res = fd_neigh4_hmap_prepare( ctx->neigh4, &ip4_addr, NULL, query, 0 );
+ if( FD_UNLIKELY( prepare_res!=FD_MAP_SUCCESS ) ) {
+ ctx->metrics.neigh_solicits_fails++;
+ return;
+ }
+ fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query );
+ ele->state = FD_NEIGH4_STATE_INCOMPLETE;
+ ele->ip4_addr = ip4_addr;
+ memset( ele->mac_addr, 0, 6UL );
+ fd_neigh4_hmap_publish( query );
+
+ /* Trigger neighbor solicit via netlink.
+ NOTE(review): the placeholder published above is left in place if
+ the solicit fails, so later requests for the same address are
+ dropped by the lookup above until the entry is changed elsewhere --
+ confirm this is acceptable. */
+
+ int netlink_res = fd_neigh4_netlink_solicit( ctx->nl_req, if_idx, ip4_addr );
+ if( FD_UNLIKELY( netlink_res<0 ) ) {
+ ctx->metrics.neigh_solicits_fails++;
+ return;
+ }
+
+ ctx->metrics.neigh_solicits_sent++;
+
+}
+
+#define STEM_BURST (1UL)
+#define STEM_LAZY ((ulong)97e6) /* 97ms */
+
+#define STEM_CALLBACK_CONTEXT_TYPE fd_netlink_tile_ctx_t
+#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_netlink_tile_ctx_t)
+
+#define STEM_CALLBACK_METRICS_WRITE metrics_write
+#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
+#define STEM_CALLBACK_BEFORE_CREDIT before_credit
+#define STEM_CALLBACK_AFTER_FRAG after_frag
+
+#include "../stem/fd_stem.c"
+
+/* End stem methods */
+
+/* fd_tile_netlink is the run descriptor of the netlink tile, named
+ "netlnk". It ties the sandbox configuration, scratch layout and
+ init hooks defined in this file to stem_run (presumably provided by
+ the fd_stem.c include above, specialized by the STEM_ macros). */
+fd_topo_run_tile_t fd_tile_netlink = {
+ .name = "netlnk",
+ .populate_allowed_seccomp = populate_allowed_seccomp,
+ .populate_allowed_fds = populate_allowed_fds,
+ .scratch_align = scratch_align,
+ .scratch_footprint = scratch_footprint,
+ .privileged_init = privileged_init,
+ .unprivileged_init = unprivileged_init,
+ .run = stem_run
+};
+
+/* FIXME handle ENOBUFS */
diff --git a/src/disco/netlink/fd_netlink_tile.h b/src/disco/netlink/fd_netlink_tile.h
new file mode 100644
index 0000000000..fe94bc5db1
--- /dev/null
+++ b/src/disco/netlink/fd_netlink_tile.h
@@ -0,0 +1,70 @@
+#ifndef HEADER_fd_src_disco_netlink_fd_netlink_tile_h
+#define HEADER_fd_src_disco_netlink_fd_netlink_tile_h
+
+/* fd_netlink_tile.h provides APIs for working with the netlink tile. */
+
+#include "../topo/fd_topo.h"
+
+/* fd_tile_netlink provides the netlink tile.
+
+ Consult /book/guide/netlink.md for more information.
+ Web mirror: https://docs.firedancer.io/guide/netlink.html */
+
+FD_PROTOTYPES_BEGIN
+extern fd_topo_run_tile_t fd_tile_netlink;
+FD_PROTOTYPES_END
+
+/* fd_netlink_neigh4_solicit_link_t holds information required to send
+ neighbor solicitation requests to the netlink tile. It is the
+ producer-side state used by fd_netlink_neigh4_solicit{,_sse} below;
+ the caller is responsible for filling it in. */
+
+struct fd_netlink_neigh4_solicit_link {
+ fd_frag_meta_t * mcache; /* mcache that requests are published to */
+ ulong depth; /* depth passed along with mcache when publishing */
+ ulong seq; /* sequence number of the next request; advanced by one per publish */
+};
+
+typedef struct fd_netlink_neigh4_solicit_link fd_netlink_neigh4_solicit_link_t;
+
+struct fdctl_config;
+
+FD_PROTOTYPES_BEGIN
+
+void
+fd_netlink_topo_create( fd_topo_tile_t * netlink_tile,
+ fd_topo_t * topo,
+ struct fdctl_config const * config );
+
+void
+fd_netlink_topo_join( fd_topo_t * topo,
+ fd_topo_tile_t * netlink_tile,
+ fd_topo_tile_t * join_tile );
+
+/* fd_netlink_neigh4_solicit{,_sse} requests a neighbor solicitation (i.e.
+   ARP request) for an IPv4 address.  Safe to call at a high rate.  The
+   netlink tile will deduplicate requests.  ip4_addr is big endian.
+
+   The request is a payload-free frag whose sig field is ip4_addr; all
+   other metadata fields except tspub (tspub_comp) are zero.  link->seq
+   is advanced by one on return.
+
+   NOTE(review): sig carries only the address, so bits 32 and up are
+   zero.  Confirm that this matches the sig layout the netlink tile
+   expects for solicit requests. */
+
+static inline void
+fd_netlink_neigh4_solicit( fd_netlink_neigh4_solicit_link_t * link,
+                           uint                               ip4_addr,
+                           ulong                              tspub_comp ) {
+  ulong const cur_seq = link->seq;
+  fd_mcache_publish( link->mcache, link->depth, cur_seq, (ulong)ip4_addr, 0UL, 0UL, 0UL, 0UL, tspub_comp );
+  link->seq = fd_seq_inc( cur_seq, 1UL );
+}
+
+#if FD_HAS_SSE
+/* SSE flavor of fd_netlink_neigh4_solicit: identical arguments and
+   effect, but publishes through fd_mcache_publish_sse. */
+static inline void
+fd_netlink_neigh4_solicit_sse( fd_netlink_neigh4_solicit_link_t * link,
+                               uint                               ip4_addr,
+                               ulong                              tspub_comp ) {
+  ulong const cur_seq = link->seq;
+  fd_mcache_publish_sse( link->mcache, link->depth, cur_seq, (ulong)ip4_addr, 0UL, 0UL, 0UL, 0UL, tspub_comp );
+  link->seq = fd_seq_inc( cur_seq, 1UL );
+}
+#endif /* FD_HAS_SSE */
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_disco_netlink_fd_netlink_tile_h */
diff --git a/src/disco/netlink/fd_netlink_tile_private.h b/src/disco/netlink/fd_netlink_tile_private.h
new file mode 100644
index 0000000000..ebbe806314
--- /dev/null
+++ b/src/disco/netlink/fd_netlink_tile_private.h
@@ -0,0 +1,58 @@
+#ifndef HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h
+#define HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h
+
+#include "../../waltz/ip/fd_netlink1.h"
+#include "../metrics/generated/fd_metrics_netlnk.h"
+#include "../../waltz/ip/fd_fib4.h"
+#include "../../waltz/mib/fd_dbl_buf.h"
+#include "../../waltz/mib/fd_netdev_tbl.h"
+#include "../../waltz/neigh/fd_neigh4_map.h"
+
+/* FD_NETLINK_TILE_CTX_MAGIC uniquely identifies a fd_netlink_tile_ctx_t.
+ Change this whenever the fd_netlink_tile_ctx_t struct changes. */
+
+#define FD_NETLINK_TILE_CTX_MAGIC (0xec431bf97929c691UL) /* random */
+
+/* fd_netlink_tile_ctx_t is the netlink tile's private state. It sits
+ at the start of the tile's scratch region and is followed there by
+ the local netdev table (see netdev_local). */
+struct fd_netlink_tile_ctx {
+ ulong magic; /* ==FD_NETLINK_TILE_CTX_MAGIC */
+
+ fd_netlink_t nl_monitor[1]; /* rtnetlink socket receiving update notifications */
+ fd_netlink_t nl_req[1]; /* rtnetlink socket for request-reply (dumps, solicits) */
+
+ /* Pending actions (bit set of FD_NET_TILE_ACTION_*), set when a table
+ needs refreshing and cleared when the refresh is started */
+ ulong action;
+# define FD_NET_TILE_ACTION_ROUTE4_UPDATE (1UL<<0)
+# define FD_NET_TILE_ACTION_LINK_UPDATE (1UL<<1)
+# define FD_NET_TILE_ACTION_NEIGH_UPDATE (1UL<<2)
+
+ /* Rate limit link and route table changes (in ticks) */
+ long update_backoff; /* minimum tick distance between two refreshes of a kind */
+ long route4_update_ts; /* earliest tick count for the next route refresh */
+ long link_update_ts; /* earliest tick count for the next link refresh */
+
+ /* Link table */
+ void * netdev_local; /* local mutable table */
+ ulong netdev_sz; /* size of netdev table */
+ fd_netdev_tbl_join_t netdev_tbl[1]; /* join to local mutable table */
+ fd_dbl_buf_t * netdev_buf; /* global immutable copy */
+
+ /* Route tables */
+ fd_fib4_t * fib4_local; /* loaded from RT_TABLE_LOCAL */
+ fd_fib4_t * fib4_main; /* loaded from RT_TABLE_MAIN */
+
+ /* Neighbor table */
+ fd_neigh4_hmap_t neigh4[1];
+ uint neigh4_ifidx; /* only this interface's neighbors are tracked */
+
+ /* Counters exported by the tile's metrics callback */
+ struct {
+ ulong link_full_syncs;
+ ulong route_full_syncs;
+ ulong update_cnt[ FD_METRICS_COUNTER_NETLNK_UPDATES_CNT ]; /* indexed by FD_METRICS_ENUM_NETLINK_MSG_V_*_IDX */
+ ulong neigh_solicits_sent;
+ ulong neigh_solicits_fails;
+ } metrics;
+};
+
+typedef struct fd_netlink_tile_ctx fd_netlink_tile_ctx_t;
+
+#endif /* HEADER_fd_src_disco_netlink_fd_netlink_tile_private_h */
diff --git a/src/disco/netlink/generated/netlink_seccomp.h b/src/disco/netlink/generated/netlink_seccomp.h
new file mode 100644
index 0000000000..369efe7009
--- /dev/null
+++ b/src/disco/netlink/generated/netlink_seccomp.h
@@ -0,0 +1,102 @@
+/* THIS FILE WAS GENERATED BY generate_filters.py. DO NOT EDIT BY HAND! */
+#ifndef HEADER_fd_src_disco_netlink_generated_netlink_seccomp_h
+#define HEADER_fd_src_disco_netlink_generated_netlink_seccomp_h
+
+#include "../../../../src/util/fd_util_base.h"
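+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/bpf.h>
+#include <sys/syscall.h>
+#include <signal.h>
+#include <stddef.h>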
+
+#if defined(__i386__)
+# define ARCH_NR AUDIT_ARCH_I386
+#elif defined(__x86_64__)
+# define ARCH_NR AUDIT_ARCH_X86_64
+#elif defined(__aarch64__)
+# define ARCH_NR AUDIT_ARCH_AARCH64
+#else
+# error "Target architecture is unsupported by seccomp."
+#endif
+static const unsigned int sock_filter_policy_netlink_instr_cnt = 34;
+
+static void populate_sock_filter_policy_netlink( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int nl_mon_fd, unsigned int nl_req_fd) {
+ FD_TEST( out_cnt >= 34 );
+ struct sock_filter filter[34] = {
+ /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 30 ),
+ /* loading syscall number in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ),
+ /* allow write based on expression */
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_write, /* check_write */ 4, 0 ),
+ /* allow fsync based on expression */
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_fsync, /* check_fsync */ 7, 0 ),
+ /* allow sendto based on expression */
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 8, 0 ),
+ /* allow recvfrom based on expression */
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvfrom, /* check_recvfrom */ 15, 0 ),
+ /* none of the syscalls matched */
+ { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 24 },
+// check_write:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 23, /* lbl_1 */ 0 ),
+// lbl_1:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 21, /* RET_KILL_PROCESS */ 20 ),
+// check_fsync:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 19, /* RET_KILL_PROCESS */ 18 ),
+// check_sendto:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 16 ),
+// lbl_2:
+ /* load syscall argument 3 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_3 */ 0, /* RET_KILL_PROCESS */ 14 ),
+// lbl_3:
+ /* load syscall argument 4 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 12 ),
+// lbl_4:
+ /* load syscall argument 5 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 11, /* RET_KILL_PROCESS */ 10 ),
+// check_recvfrom:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_mon_fd, /* lbl_5 */ 2, /* lbl_6 */ 0 ),
+// lbl_6:
+ /* load syscall argument 0 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, nl_req_fd, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 6 ),
+// lbl_5:
+ /* load syscall argument 3 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 4 ),
+// lbl_7:
+ /* load syscall argument 4 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 2 ),
+// lbl_8:
+ /* load syscall argument 5 in accumulator */
+ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])),
+ BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ),
+// RET_KILL_PROCESS:
+ /* KILL_PROCESS is placed before ALLOW since it's the fallthrough case. */
+ BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS ),
+// RET_ALLOW:
+ /* ALLOW has to be reached by jumping */
+ BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_ALLOW ),
+ };
+ fd_memcpy( out, filter, sizeof( filter ) );
+}
+
+#endif
diff --git a/src/disco/netlink/netlink.seccomppolicy b/src/disco/netlink/netlink.seccomppolicy
new file mode 100644
index 0000000000..4269886c49
--- /dev/null
+++ b/src/disco/netlink/netlink.seccomppolicy
@@ -0,0 +1,37 @@
+# logfile_fd: It can be disabled by configuration, but typically tiles
+# will open a log file on boot and write all messages there.
+#
+# nl_mon_fd: An rtnetlink socket used to monitor updates
+# nl_req_fd: An rtnetlink socket used for request-reply
+unsigned int logfile_fd, unsigned int nl_mon_fd, unsigned int nl_req_fd
+
+# logging: all log messages are written to a file and/or pipe
+#
+# 'WARNING' and above are written to the STDERR pipe, while all messages
+# are always written to the log file.
+#
+# arg 0 is the file descriptor to write to. The boot process ensures
+# that descriptor 2 is always STDERR; the logfile is given by logfile_fd.
+write: (or (eq (arg 0) 2)
+ (eq (arg 0) logfile_fd))
+
+# logging: 'WARNING' and above fsync the logfile to disk immediately
+#
+# arg 0 is the file descriptor to fsync. Only the logfile descriptor
+# (logfile_fd) may be synced.
+fsync: (eq (arg 0) logfile_fd)
+
+# sendto(2) is used to send netlink requests to the kernel
+# (In theory could use send(2) but that syscall doesn't exist on arm64)
+sendto: (and (eq (arg 0) nl_req_fd)
+ (eq (arg 3) 0)
+ (eq (arg 4) 0)
+ (eq (arg 5) 0))
+
+# recvfrom(2) is used to receive netlink responses from the kernel
+# (Using recvfrom(2) instead of recv(2) for same ABI reasons as above)
+recvfrom: (and (or (eq (arg 0) nl_mon_fd)
+ (eq (arg 0) nl_req_fd))
+ (eq (arg 3) 0)
+ (eq (arg 4) 0)
+ (eq (arg 5) 0))
diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h
index 69977878c4..ae954d522d 100644
--- a/src/disco/topo/fd_topo.h
+++ b/src/disco/topo/fd_topo.h
@@ -141,12 +141,17 @@ typedef struct {
ushort gossip_listen_port;
ushort repair_intake_listen_port;
ushort repair_serve_listen_port;
-
- /* multihoming support */
- ulong multihome_ip_addrs_cnt;
- uint multihome_ip_addrs[FD_NET_MAX_SRC_ADDR];
} net;
+ struct {
+ ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */
+ ulong fib4_main_obj_id; /* fib4 containing main route table */
+ ulong fib4_local_obj_id; /* fib4 containing local route table */
+ uint neigh_if_idx; /* neigh4 interface index */
+ ulong neigh4_obj_id; /* neigh4 hash map header */
+ ulong neigh4_ele_obj_id; /* neigh4 hash map slots */
+ } netlink;
+
struct {
uint out_depth;
uint reasm_cnt;
diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c
index e29ee8c7a3..00453ca964 100644
--- a/src/disco/topo/fd_topob.c
+++ b/src/disco/topo/fd_topob.c
@@ -322,6 +322,7 @@ fd_topob_auto_layout( fd_topo_t * topo ) {
which should be floating. */
char const * FLOATING[] = {
+ "netlnk",
"metric",
"cswtch",
"bencho",
diff --git a/src/waltz/ip/Local.mk b/src/waltz/ip/Local.mk
index 5becc29df0..ef5c8d4e32 100644
--- a/src/waltz/ip/Local.mk
+++ b/src/waltz/ip/Local.mk
@@ -14,3 +14,13 @@ $(call run-unit-test,test_netlink)
$(call run-unit-test,test_routing)
endif
endif
+
+$(call add-hdrs,fd_fib4.h)
+$(call add-objs,fd_fib4,fd_waltz)
+ifdef FD_HAS_LINUX
+$(call add-objs,fd_netlink1 fd_fib4_netlink,fd_waltz)
+$(call make-unit-test,test_fib4_netlink,test_fib4_netlink,fd_waltz fd_util)
+$(call run-unit-test,test_fib4_netlink)
+endif
+$(call make-unit-test,test_fib4,test_fib4,fd_waltz fd_util)
+$(call run-unit-test,test_fib4)
diff --git a/src/waltz/ip/fd_fib4.c b/src/waltz/ip/fd_fib4.c
new file mode 100644
index 0000000000..4e21c18631
--- /dev/null
+++ b/src/waltz/ip/fd_fib4.c
@@ -0,0 +1,318 @@
+#include "fd_fib4.h"
+#include "fd_fib4_private.h"
+#include "../../util/fd_util.h"
+
+/* FIXME this implementation is not completely robust against torn reads */
+
+/* fd_fib4_align returns the alignment in bytes required of fib4 memory. */
+FD_FN_CONST ulong fd_fib4_align( void ) {
+  return alignof(fd_fib4_t);
+}
+
+/* fd_fib4_footprint returns the byte size of a fib4 with route_max route
+   slots, or 0 if route_max is zero or does not fit in a uint. */
+FD_FN_CONST ulong fd_fib4_footprint( ulong route_max ) {
+  if( !route_max || route_max>UINT_MAX ) return 0UL;
+  ulong l = FD_LAYOUT_APPEND( FD_LAYOUT_INIT, alignof(fd_fib4_t), sizeof(fd_fib4_t) );
+  l = FD_LAYOUT_APPEND( l, alignof(fd_fib4_key_t), route_max*sizeof(fd_fib4_key_t) );
+  l = FD_LAYOUT_APPEND( l, alignof(fd_fib4_hop_t), route_max*sizeof(fd_fib4_hop_t) );
+  return FD_LAYOUT_FINI( l, alignof(fd_fib4_t) );
+}
+
+/* fd_fib4_new formats mem as an empty fib4 with route_max slots (slot 0
+   holds the dummy route).  Returns the formatted fib4 on success.  Logs a
+   warning and returns NULL on NULL/misaligned mem or invalid route_max. */
+void *
+fd_fib4_new( void * mem,
+             ulong  route_max ) {
+  if( FD_UNLIKELY( !mem ) ) {
+    FD_LOG_WARNING(( "NULL mem" ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_fib4_align() ) ) ) {
+    FD_LOG_WARNING(( "unaligned mem" ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( !route_max || route_max>UINT_MAX ) ) {
+    FD_LOG_WARNING(( "invalid route_max" ));
+    return NULL;
+  }
+
+  /* Carve the header, key table and hop table out of mem, in that order */
+  FD_SCRATCH_ALLOC_INIT( l, mem );
+  fd_fib4_t *     hdr     = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_fib4_t),     sizeof(fd_fib4_t)               );
+  fd_fib4_key_t * key_tbl = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_fib4_key_t), route_max*sizeof(fd_fib4_key_t) );
+  fd_fib4_hop_t * hop_tbl = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_fib4_hop_t), route_max*sizeof(fd_fib4_hop_t) );
+  FD_SCRATCH_ALLOC_FINI( l, alignof(fd_fib4_t) );
+  fd_memset( hdr,     0, sizeof(fd_fib4_t)               );
+  fd_memset( key_tbl, 0, route_max*sizeof(fd_fib4_key_t) );
+  fd_memset( hop_tbl, 0, route_max*sizeof(fd_fib4_hop_t) );
+  hdr->hop_off    = (ulong)hop_tbl - (ulong)hdr;
+  hdr->max        = (uint)route_max;
+  key_tbl[0].prio = UINT_MAX; /* dummy route gets the largest metric */
+  fd_fib4_clear( hdr );
+  return hdr;
+}
+
+/* fd_fib4_join returns mem cast to a local fib4 handle (no validation). */
+fd_fib4_t * fd_fib4_join( void * mem ) {
+  return (fd_fib4_t *)mem;
+}
+
+/* fd_fib4_leave gives up a local join and returns the underlying memory. */
+void * fd_fib4_leave( fd_fib4_t * fib4 ) {
+  return fib4;
+}
+
+/* fd_fib4_delete returns mem unchanged; the region is not modified. */
+void * fd_fib4_delete( void * mem ) {
+  return mem;
+}
+
+void
+fd_fib4_clear( fd_fib4_t * fib4 ) {
+ /* Empties the table down to the dummy route (index 0) and opens it for writes. */
+ /* Step 1: Make default route negative (hop 0 becomes BLACKHOLE, so a
+    lookup that resolves to the dummy route reports "drop packet") */
+ fd_fib4_hop_tbl( fib4 )->rtype = FD_FIB4_RTYPE_BLACKHOLE;
+ FD_COMPILER_MFENCE();
+
+ /* Step 2: Disable all other routes (only index 0 stays below active_cnt) */
+
+ fib4->active_cnt = 1U;
+ FD_COMPILER_MFENCE();
+
+ /* Step 3: Indicate we are mid write (readers compare generation values) */
+
+ fib4->generation++;
+ FD_COMPILER_MFENCE();
+
+ /* Step 4: Update metadata (prepare_cnt=1: fd_fib4_append starts at index 1) */
+ /* NOTE(review): generation is bumped a second time here; confirm intent */
+ fib4->generation++;
+ fib4->prepare_cnt = 1U;
+}
+
+/* fd_fib4_max returns the route capacity recorded in the fib4 header. */
+FD_FN_PURE ulong fd_fib4_max( fd_fib4_t const * fib ) {
+  return fib->max;
+}
+
+/* fd_fib4_cnt returns prepare_cnt while it is non-zero (PREPARE state), else active_cnt. */
+FD_FN_PURE ulong fd_fib4_cnt( fd_fib4_t const * fib ) {
+  return !fib->prepare_cnt ? fib->active_cnt : fib->prepare_cnt;
+}
+
+ulong fd_fib4_free_cnt( fd_fib4_t const * fib ) {
+  uint used = fib->prepare_cnt; /* slots consumed so far; 0 means ACTIVE state (no appends possible) */
+  if( FD_UNLIKELY( !used ) ) return 0UL;
+  if( FD_UNLIKELY( used > fib->max ) ) FD_LOG_CRIT(( "prepare_cnt > max" ));
+  return fib->max - used;
+}
+
+/* fd_fib4_append reserves the next slot of a fib in PREPARE state, fills
+   in its key and returns the slot's hop for the caller to fill.  Returns
+   NULL (with a warning) if the table is full or fib is not in PREPARE. */
+fd_fib4_hop_t *
+fd_fib4_append( fd_fib4_t * fib,
+                uint        ip4_dst,
+                int         prefix,
+                uint        prio ) {
+  uint slot = fib->prepare_cnt;
+  if( FD_UNLIKELY( slot>=fib->max ) ) {
+    FD_LOG_WARNING(( "Failed to insert route, route table is full (%u max)", fib->max ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( !slot ) ) {
+    FD_LOG_WARNING(( "Attempted to write to fib4 without lock" ));
+    return NULL;
+  }
+  fib->prepare_cnt = slot+1U;
+  fd_fib4_key_t * key = fd_fib4_key_tbl( fib ) + slot;
+  key->addr = fd_uint_bswap( ip4_dst );
+  key->mask = prefix>0 ? fd_uint_mask( 32-prefix, 31 ) : 0U; /* top `prefix` bits set */
+  key->prio = prio;
+  return fd_fib4_hop_tbl( fib ) + slot;
+}
+
+/* fd_fib4_publish moves fib from PREPARE to ACTIVE state, exposing every
+   route appended since fd_fib4_clear.  Does nothing if fib is already
+   ACTIVE (otherwise active_cnt would be reset to 0, wiping the table). */
+void
+fd_fib4_publish( fd_fib4_t * fib ) {
+  uint cnt = fib->prepare_cnt;
+  if( FD_UNLIKELY( !cnt ) ) return; /* already ACTIVE */
+  /* Step 1: Enable new routes */
+  fib->active_cnt = cnt;
+  FD_COMPILER_MFENCE();
+
+  /* Step 2: Make default route neutral */
+  fd_fib4_hop_tbl( fib )->rtype = FD_FIB4_RTYPE_THROW;
+  FD_COMPILER_MFENCE();
+
+  /* Step 3: Indicate that write is complete */
+  fib->generation++;
+  FD_COMPILER_MFENCE();
+
+  /* Step 4: Update metadata */
+  fib->prepare_cnt = 0U;
+}
+
+fd_fib4_hop_t const *
+fd_fib4_lookup( fd_fib4_t const * fib,
+ fd_fib4_hop_t * out,
+ uint ip4_dst,
+ ulong flags ) {
+ if( FD_UNLIKELY( flags ) ) {
+ return fd_fib4_hop_tbl_const( fib ) + 0; /* dead route */
+ }
+ ip4_dst = fd_uint_bswap( ip4_dst );
+
+ ulong generation = fib->generation;
+ fd_fib4_key_t const * keys = fd_fib4_key_tbl_const( fib );
+ FD_COMPILER_MFENCE();
+
+ ulong best_idx = 0UL; /* dead route */
+ int best_mask = 32; /* least specific mask (/0) */
+ for( ulong j=0UL; j<(fib->active_cnt); j++ ) {
+ /* FIXME consider branch variant? */
+ int match = (ip4_dst & keys[j].mask)==keys[j].addr;
+ int mask_bits = fd_uint_find_lsb_w_default( keys[j].mask, 32 );
+ int more_specific = mask_bits< best_mask;
+ int less_costly = mask_bits==best_mask && keys[j].priogeneration!=generation ) ) {
+ return fd_fib4_hop_tbl_const( fib ) + 0; /* dead route */
+ }
+ return out;
+}
+
+#if FD_HAS_HOSTED
+
+#include
+#include
+#include "../../util/net/fd_ip4.h"
+
+#define WRAP_PRINT(file,str) if( FD_UNLIKELY( fputs( (str), (file) )<0 ) ) return errno
+#define WRAP_PRINTF(file,...) if( FD_UNLIKELY( fprintf( (file), __VA_ARGS__ )<0 ) ) return errno
+
+static int
+fd_fib4_fprintf_route( fd_fib4_key_t const * key,
+ fd_fib4_hop_t const * hop,
+ FILE * file ) {
+ /* Writes one route as a single text line to file. Returns 0 on success, or errno if a write fails (via WRAP_PRINT/WRAP_PRINTF). */
+ switch( hop->rtype ) {
+ case FD_FIB4_RTYPE_UNSPEC:
+ WRAP_PRINT( file, "unspecified " );
+ break;
+ case FD_FIB4_RTYPE_UNICAST:
+ break; /* unicast routes are printed without a type prefix */
+ case FD_FIB4_RTYPE_LOCAL:
+ WRAP_PRINT( file, "local " );
+ break;
+ case FD_FIB4_RTYPE_BROADCAST:
+ WRAP_PRINT( file, "broadcast " );
+ break;
+ case FD_FIB4_RTYPE_MULTICAST:
+ WRAP_PRINT( file, "multicast " );
+ break;
+ case FD_FIB4_RTYPE_BLACKHOLE:
+ WRAP_PRINT( file, "blackhole " );
+ break;
+ case FD_FIB4_RTYPE_THROW:
+ WRAP_PRINT( file, "throw " );
+ break;
+ default:
+ WRAP_PRINTF( file, "invalid (%u) ", hop->rtype );
+ break;
+ }
+ /* Destination: "default" for an all-zero mask, else the address plus "/len" unless the mask is all ones */
+ if( key->mask==0 ) {
+ WRAP_PRINT( file, "default" );
+ } else {
+ WRAP_PRINTF( file, FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( fd_uint_bswap( key->addr ) ) );
+ if( key->mask!=UINT_MAX ) {
+ WRAP_PRINTF( file, "/%u", 32U-(uint)fd_uint_find_lsb_w_default( key->mask, 32 ) );
+ }
+ }
+ /* Gateway, device, source address and metric are only printed when non-zero */
+ if( hop->ip4_gw ) {
+ WRAP_PRINTF( file, " via " FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( hop->ip4_gw ) );
+ }
+
+ if( hop->if_idx ) {
+ WRAP_PRINTF( file, " dev %u", hop->if_idx );
+ }
+ /* Scope: 0 prints nothing; 200/253/254 print as site/link/host; anything else numerically */
+ switch( hop->scope ) {
+ case 0:
+ break;
+ case 200:
+ WRAP_PRINT( file, " scope site" );
+ break;
+ case 253:
+ WRAP_PRINT( file, " scope link" );
+ break;
+ case 254:
+ WRAP_PRINT( file, " scope host" );
+ break;
+ default:
+ WRAP_PRINTF( file, " scope %u", hop->scope );
+ break;
+ }
+
+ if( hop->ip4_src ) {
+ WRAP_PRINTF( file, " src " FD_IP4_ADDR_FMT, FD_IP4_ADDR_FMT_ARGS( hop->ip4_src ) );
+ }
+
+ if( key->prio ) {
+ WRAP_PRINTF( file, " metric %u", key->prio );
+ }
+
+ WRAP_PRINT( file, "\n" );
+
+ return 0;
+}
+
+int
+fd_fib4_fprintf( fd_fib4_t const * fib,
+ void * file_ ) {
+ FILE * file = file_;
+ fd_fib4_key_t const * key_tbl = fd_fib4_key_tbl_const( fib );
+ fd_fib4_hop_t const * hop_tbl = fd_fib4_hop_tbl_const( fib );
+
+ FD_COMPILER_MFENCE();
+ ulong active_cnt = fib->active_cnt;
+ ulong generation = fib->generation;
+ FD_COMPILER_MFENCE();
+
+ for( ulong j=0UL; jgeneration );
+ FD_COMPILER_MFENCE();
+ if( FD_UNLIKELY( cur_gen!=generation ) ) {
+ WRAP_PRINT( file, "=== TORN READ ===\n" );
+ return 0;
+ }
+ fd_fib4_fprintf_route( &key, &hop, file );
+ }
+
+ return 0;
+}
+
+#undef WRAP_PRINT
+#undef WRAP_PRINTF
+
+#endif /* FD_HAS_HOSTED */
diff --git a/src/waltz/ip/fd_fib4.h b/src/waltz/ip/fd_fib4.h
new file mode 100644
index 0000000000..e71a66a905
--- /dev/null
+++ b/src/waltz/ip/fd_fib4.h
@@ -0,0 +1,187 @@
+#ifndef HEADER_fd_src_waltz_ip_fd_fib4_h
+#define HEADER_fd_src_waltz_ip_fd_fib4_h
+
+/* A fib4 stores IPv4 routes in a query-optimized data structure.
+
+ fib4 does not scale well to large numbers of routes. Every route
+ lookup is O(n) where n is the number of routes in the FIB.
+
+ fib4 only supports a minimal set of features required for end devices
+ to operate. Packet forwarding is not supported.
+
+ fib4 supports multi-threaded operation in a x86-TSO like environment.
+ (many reader threads, one writer thread) Refer to each function for
+ thread safety.
+
+ A fib4 has two states: PREPARE and ACTIVE. In ACTIVE state, FIB lookups
+ function as expected but writes are prohibited. In PREPARE state, any
+ FIB lookup returns 'FD_FIB4_RTYPE_BLACKHOLE' but writes are allowed.
+
+ A fib4 always has a dummy route at index 0. In PREPARE state, this
+ route is a BLACKHOLE route (terminates routing and drops the packet),
+ otherwise it is a THROW route (continues routing with the next table).
+
+ FIXME: CONSIDER TRIE BASED DATA STRUCTURE
+
+ Trivia: https://en.wikipedia.org/wiki/Forwarding_information_base */
+
+#include "../../util/fd_util_base.h"
+
+#define FD_FIB4_ALIGN (16UL)
+
+/* FD_FIB4_RTYPE_{...} enumerate route types.
+ These match Linux RTN_UNICAST, etc. */
+
+#define FD_FIB4_RTYPE_UNSPEC (0) /* invalid */
+#define FD_FIB4_RTYPE_UNICAST (1) /* "normal" path */
+#define FD_FIB4_RTYPE_LOCAL (2) /* address on local host */
+#define FD_FIB4_RTYPE_BROADCAST (3) /* reserved for future use */
+#define FD_FIB4_RTYPE_MULTICAST (5) /* reserved for future use */
+#define FD_FIB4_RTYPE_BLACKHOLE (6) /* drop packet */
+#define FD_FIB4_RTYPE_THROW (9) /* continue in next table */
+
+/* fd_fib4_t is a local handle to a fib4 object. Use fd_fib4_{align,
+ footprint,new,delete,join,leave} to construct and join a fib4. */
+
+struct fd_fib4;
+typedef struct fd_fib4 fd_fib4_t;
+
+/* fd_fib4_hop_t holds a FIB lookup result (see fd_fib4_lookup) */
+
+struct __attribute__((aligned(16))) fd_fib4_hop {
+ uint ip4_gw; /* gateway address (big endian); fd_fib4_fprintf omits it when 0 */
+ uint if_idx; /* output interface index (host byte order) */
+ uint ip4_src; /* override source address (big endian). 0 implies unset */
+ uchar rtype; /* route type (e.g. FD_FIB4_RTYPE_UNICAST) */
+ uchar scope; /* used to select source address */
+ uchar flags; /* app-specific flags (the netlink importer ORs in FD_FIB4_FLAG_{...}) */
+};
+
+#define FD_FIB4_FLAG_RTA_UNSUPPORTED ((uchar)0x01U) /* bit 0: unsupported route attribute */
+#define FD_FIB4_FLAG_RTA_PARSE_ERR ((uchar)0x02U) /* bit 1: failed to interpret route attribute */
+#define FD_FIB4_FLAG_RTYPE_UNSUPPORTED ((uchar)0x04U) /* bit 2: unsupported route type (flags are ORed together, so each needs its own bit; 0x03 aliased the two above) */
+
+typedef struct fd_fib4_hop fd_fib4_hop_t;
+
+FD_PROTOTYPES_BEGIN
+
+/* Constructor APIs ******************************************************/
+
+FD_FN_CONST ulong
+fd_fib4_align( void );
+
+FD_FN_CONST ulong
+fd_fib4_footprint( ulong route_max );
+
+void *
+fd_fib4_new( void * mem,
+ ulong route_max );
+
+fd_fib4_t *
+fd_fib4_join( void * mem );
+
+void *
+fd_fib4_leave( fd_fib4_t * fib4 );
+
+void *
+fd_fib4_delete( void * mem );
+
+/* Write APIs *************************************************************
+
+ Currently, any updates to a fib4 require a full rewrite (incremental
+ updates are not supported). During an update, fd_fib4_lookup calls
+ temporarily return a route entry with FD_FIB4_RTYPE_BLACKHOLE, which
+ means outgoing packets get dropped. (This is preferable to potentially
+ making an incorrect routing decision based on a partial route table.)
+
+ Example usage:
+
+ fd_fib4_clear()
+ ... multiple calls to fd_fib4_append() ...
+ fd_fib4_publish() */
+
+/* fd_fib4_clear removes all route entries. Transitions the fib to PREPARE
+ state. */
+
+void
+fd_fib4_clear( fd_fib4_t * fib );
+
+/* fd_fib4_max returns the max number of routes in the table. */
+
+FD_FN_PURE ulong
+fd_fib4_max( fd_fib4_t const * fib );
+
+/* fd_fib4_cnt returns the number of routes in the table. In PREPARE state
+ returns the number of pending routes, in ACTIVE state returns the number
+ of active routes. */
+
+FD_FN_PURE ulong
+fd_fib4_cnt( fd_fib4_t const * fib );
+
+/* fd_fib4_free_cnt returns the number of fd_fib4_append calls that are
+ guaranteed to succeed, if fib is in PREPARE state. If fib is in ACTIVE
+ state returns 0. */
+
+FD_FN_PURE ulong
+fd_fib4_free_cnt( fd_fib4_t const * fib );
+
+/* fd_fib4_append attempts to add a new route entry. Assumes the fib is in
+ PREPARE state. If fd_fib4_free_cnt(fib) returned non-zero immediately
+ prior to calling append, then append is guaranteed to succeed.
+
+ Returns a hop object to be filled by the caller on success. On failure,
+ returns NULL and logs warning. Reasons for failure include no space
+ left or fib not in PREPARE state. */
+
+fd_fib4_hop_t *
+fd_fib4_append( fd_fib4_t * fib,
+ uint ip4_dst,
+ int prefix,
+ uint prio );
+
+/* fd_fib4_publish transitions the fib from PREPARE to ACTIVE state. If
+ the fib is already ACTIVE does nothing. */
+
+void
+fd_fib4_publish( fd_fib4_t * fib );
+
+/* Read APIs */
+
+/* fd_fib4_lookup resolves the next hop for an arbitrary IPv4 address.
+ If route was not found, retval->rtype is set to FD_FIB4_RTYPE_THROW.
+ If fib is not in ACTIVE state, retval->rtype is set to
+ FD_FIB4_RTYPE_BLACKHOLE.
+
+ Thread safe; Gracefully handles concurrent route updates by other
+ threads. */
+
+fd_fib4_hop_t const *
+fd_fib4_lookup( fd_fib4_t const * fib,
+ fd_fib4_hop_t * out,
+ uint ip4_dst,
+ ulong flags );
+
+/* fd_fib4_hop_or is a helper to chain together multiple FIB lookups. */
+
+FD_FN_PURE static inline fd_fib4_hop_t const *
+fd_fib4_hop_or( fd_fib4_hop_t const * left,
+                fd_fib4_hop_t const * right ) {
+  return left->rtype==FD_FIB4_RTYPE_THROW ? right : left; /* a THROW result defers to the next lookup */
+}
+
+#if FD_HAS_HOSTED
+
+/* fd_fib4_fprintf prints the routing table to the given FILE * pointer (or
+ target equivalent). Order of routes is undefined but guaranteed to be
+ stable between calls. Outputs ASCII encoding with LF newlines. Returns
+ errno on failure and 0 on success. Only works on ACTIVE tables. */
+
+int
+fd_fib4_fprintf( fd_fib4_t const * fib,
+ void * file );
+
+#endif
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_waltz_ip_fd_fib4_h */
diff --git a/src/waltz/ip/fd_fib4_netlink.c b/src/waltz/ip/fd_fib4_netlink.c
new file mode 100644
index 0000000000..40202191b5
--- /dev/null
+++ b/src/waltz/ip/fd_fib4_netlink.c
@@ -0,0 +1,264 @@
+#include "fd_fib4_netlink.h"
+#include "fd_fib4.h"
+#include "fd_netlink.h"
+
+#if !defined(__linux__)
+#error "fd_fib4_netlink.c requires a Linux system with kernel headers"
+#endif
+
+#include
+#include
+#include
+#include
+#include "../../util/fd_util.h"
+
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_UNSPEC ==RTN_UNSPEC, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_UNICAST ==RTN_UNICAST, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_LOCAL ==RTN_LOCAL, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_BROADCAST==RTN_BROADCAST, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_MULTICAST==RTN_MULTICAST, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_BLACKHOLE==RTN_BLACKHOLE, linux );
+FD_STATIC_ASSERT( FD_FIB4_RTYPE_THROW ==RTN_THROW, linux );
+
+/* fd_fib4_rta_gateway handles an RTA_GATEWAY attribute.  A 4 byte payload
+   is stored verbatim (big endian) in hop->ip4_gw.  Any other size is hex
+   dumped at debug level and flags the hop with
+   FD_FIB4_FLAG_RTA_PARSE_ERR, leaving hop->ip4_gw untouched. */
+static void
+fd_fib4_rta_gateway( fd_fib4_hop_t * hop,
+                     void const *    rta,
+                     ulong           rta_sz ) {
+  if( FD_LIKELY( rta_sz==4UL ) ) { hop->ip4_gw = FD_LOAD( uint, rta ); return; }
+  FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_GATEWAY", rta, rta_sz ));
+  hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+}
+
+/* fd_fib4_rta_oif handles an RTA_OIF attribute.  A 4 byte payload is
+   stored in hop->if_idx (host byte order).  Any other size is hex dumped
+   at debug level and flags the hop with FD_FIB4_FLAG_RTA_PARSE_ERR. */
+static void
+fd_fib4_rta_oif( fd_fib4_hop_t * hop,
+                 void const *    rta,
+                 ulong           rta_sz ) {
+  if( FD_LIKELY( rta_sz==4UL ) ) { hop->if_idx = FD_LOAD( uint, rta ); return; }
+  FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_OIF", rta, rta_sz ));
+  hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+}
+
+/* fd_fib4_rta_prefsrc handles an RTA_PREFSRC attribute.  A 4 byte payload
+   is stored verbatim (big endian) in hop->ip4_src.  Any other size is hex
+   dumped at debug level and flags the hop with FD_FIB4_FLAG_RTA_PARSE_ERR. */
+static void
+fd_fib4_rta_prefsrc( fd_fib4_hop_t * hop,
+                     void const *    rta,
+                     ulong           rta_sz ) {
+  if( FD_LIKELY( rta_sz==4UL ) ) { hop->ip4_src = FD_LOAD( uint, rta ); return; }
+  FD_LOG_HEXDUMP_DEBUG(( "Failed to parse RTA_PREFSRC", rta, rta_sz ));
+  hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+}
+
+static int
+fd_fib4_netlink_translate( fd_fib4_t * fib,
+ struct nlmsghdr const * msg_hdr,
+ uint table_id ) {
+ uint ip4_dst = 0U;
+ int prefix = -1; /* -1 indicates unset ip4_dst / prefix */
+ uint prio = 0U; /* default metric */
+ /* Converts one route message into a fib entry. Returns 0 if the route was appended or skipped (RTA_TABLE names another table), ENOSPC if fib has no free slot. */
+ fd_fib4_hop_t hop[1] = {0};
+
+ struct rtmsg * msg = NLMSG_DATA( msg_hdr );
+ struct rtattr * rat = RTM_RTA( msg );
+ long rat_sz = (long)(int)RTM_PAYLOAD( msg_hdr );
+ /* Map the kernel route type; types not listed below are imported as BLACKHOLE and flagged */
+ switch( msg->rtm_type ) {
+ case RTN_UNICAST:
+ hop->rtype = FD_FIB4_RTYPE_UNICAST;
+ break;
+ case RTN_LOCAL:
+ hop->rtype = FD_FIB4_RTYPE_LOCAL;
+ break;
+ case RTN_BROADCAST:
+ hop->rtype = FD_FIB4_RTYPE_BROADCAST;
+ break;
+ case RTN_MULTICAST:
+ hop->rtype = FD_FIB4_RTYPE_MULTICAST;
+ break;
+ case RTN_BLACKHOLE:
+ hop->rtype = FD_FIB4_RTYPE_BLACKHOLE;
+ break;
+ default:
+ FD_LOG_DEBUG(( "Unsupported route type (%u-%s)", msg->rtm_type, fd_netlink_rtm_type_str( msg->rtm_type ) ));
+ hop->rtype = FD_FIB4_RTYPE_BLACKHOLE;
+ hop->flags |= FD_FIB4_FLAG_RTYPE_UNSUPPORTED;
+ break;
+ }
+ /* Walk the route attributes; `continue` below skips to the next attribute */
+ for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) {
+ void * rta = RTA_DATA( rat );
+ ulong rta_sz = RTA_PAYLOAD( rat );
+
+ switch( rat->rta_type ) {
+
+ case RTA_GATEWAY:
+ fd_fib4_rta_gateway( hop, rta, rta_sz );
+ break;
+
+ case RTA_DST:
+ if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+ hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+ continue;
+ }
+ ip4_dst = FD_LOAD( uint, rta ); /* big endian */
+ prefix = msg->rtm_dst_len;
+ break;
+
+ case RTA_OIF:
+ fd_fib4_rta_oif( hop, rta, rta_sz );
+ break;
+
+ case RTA_PREFSRC:
+ fd_fib4_rta_prefsrc( hop, rta, rta_sz );
+ break;
+
+ case RTA_PRIORITY:
+ if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+ hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+ continue;
+ }
+ prio = FD_LOAD( uint, rta ); /* host byte order */
+ break;
+
+ case RTA_TABLE:
+ /* Skip routes that aren't in the requested table */
+ if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+ hop->flags |= FD_FIB4_FLAG_RTA_PARSE_ERR;
+ continue;
+ }
+ if( FD_LOAD( uint, rta )!=table_id ) return 0;
+ break;
+
+ default:
+ FD_LOG_DEBUG(( "Unsupported route table attribute (%u-%s)", rat->rta_type, fd_netlink_rtattr_str( rat->rta_type ) ));
+ hop->flags |= FD_FIB4_FLAG_RTA_UNSUPPORTED;
+ break;
+ }
+ }
+ /* The free_cnt check guards the dereference below: fd_fib4_append returns NULL when the table is full */
+ if( fd_fib4_free_cnt( fib )==0UL ) return ENOSPC;
+ *fd_fib4_append( fib, ip4_dst, prefix, prio ) = *hop;
+
+ return 0;
+}
+
+/* fd_fib4_netlink_load_table requests a dump of route table table_id
+   (RTM_GETROUTE) and rebuilds fib from the reply.  Returns 0 with fib
+   published, or FD_FIB_NETLINK_ERR_{...}.  Send failures and NLMSG_ERROR
+   replies map to FD_FIB_NETLINK_ERR_IO (raw errno values would collide
+   with that code space).  NOTE(review): on send failure fib is untouched. */
+int
+fd_fib4_netlink_load_table( fd_fib4_t *    fib,
+                            fd_netlink_t * netlink,
+                            uint           table_id ) {
+  uint seq = netlink->seq++;
+  struct {
+    struct nlmsghdr nlh; /* Netlink header */
+    struct rtmsg    rtm; /* Payload - route message */
+    struct rtattr   rta;
+    uint            table_id;
+  } request;
+  request.nlh = (struct nlmsghdr) {
+    .nlmsg_type  = RTM_GETROUTE,
+    .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+    .nlmsg_len   = sizeof(request),
+    .nlmsg_seq   = seq
+  };
+  request.rtm = (struct rtmsg) {
+    .rtm_family = AF_INET, /* IPv4 */
+  };
+  request.rta = (struct rtattr) {
+    .rta_type = RTA_TABLE,
+    .rta_len  = RTA_LENGTH( sizeof(uint) )
+  };
+  request.table_id = table_id;
+
+  long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 );
+  if( FD_UNLIKELY( send_res<0 ) ) {
+    int send_errno = errno; /* capture before logging can clobber it */
+    FD_LOG_WARNING(( "netlink send(%d,RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP) failed (%d-%s)", netlink->fd, send_errno, fd_io_strerror( send_errno ) ));
+    return FD_FIB_NETLINK_ERR_IO;
+  }
+  if( FD_UNLIKELY( send_res!=sizeof(request) ) ) {
+    FD_LOG_WARNING(( "netlink send(%d,RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)", netlink->fd ));
+    return FD_FIB_NETLINK_ERR_IO;
+  }
+
+  fd_fib4_clear( fib );
+  int   dump_intr = 0;
+  int   no_space  = 0;
+  ulong route_cnt = 0UL;
+
+  uchar buf[ 4096 ];
+  fd_netlink_iter_t iter[1];
+  for( fd_netlink_iter_init( iter, netlink, buf, sizeof(buf) );
+       !fd_netlink_iter_done( iter );
+       fd_netlink_iter_next( iter, netlink ) ) {
+    struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter );
+    if( FD_UNLIKELY( nlh->nlmsg_flags & NLM_F_DUMP_INTR ) ) dump_intr = 1;
+    if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) {
+      struct nlmsgerr * err = NLMSG_DATA( nlh );
+      int nl_err = -err->error;
+      FD_LOG_WARNING(( "netlink RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) ));
+      return FD_FIB_NETLINK_ERR_IO; /* remaining replies are not drained */
+    }
+    if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWROUTE ) ) {
+      FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type ));
+      continue;
+    }
+    route_cnt++;
+    int translate_err = fd_fib4_netlink_translate( fib, nlh, table_id );
+    if( FD_UNLIKELY( translate_err==ENOSPC ) ) {
+      no_space = 1;
+      break;
+    }
+  }
+  if( FD_UNLIKELY( iter->err > 0 ) ) return FD_FIB_NETLINK_ERR_IO;
+  ulong drain_cnt = fd_netlink_iter_drain( iter, netlink );
+
+  if( no_space ) {
+    FD_LOG_WARNING(( "Routing table is too small! `ip route show table %u` returned %lu entries, which exceeds the configured maximum of %lu",
+                     table_id, route_cnt+drain_cnt, fd_fib4_max( fib ) ));
+    fd_fib4_clear( fib );
+    return FD_FIB_NETLINK_ERR_SPACE;
+  }
+
+  if( dump_intr ) {
+    FD_LOG_DEBUG(( "received NLM_F_DUMP_INTR (our read of the routing table was overrun by a concurrent write)" ));
+    return FD_FIB_NETLINK_ERR_INTR;
+  }
+
+  if( FD_UNLIKELY( drain_cnt ) ) {
+    FD_LOG_WARNING(( "Unexpectedly skipped %lu routes. This is a bug!", drain_cnt ));
+    return FD_FIB_NETLINK_ERR_OOPS;
+  }
+  fd_fib4_publish( fib );
+  return 0;
+}
+
+/* fd_fib4_netlink_strerror maps an FD_FIB_NETLINK_{SUCCESS,ERR_*} code
+   to a short constant description.  Codes outside the known range
+   [FD_FIB_NETLINK_SUCCESS,FD_FIB_NETLINK_ERR_SPACE] yield "unknown".
+   The returned string has static storage duration. */
+FD_FN_CONST char const *
+fd_fib4_netlink_strerror( int err ) {
+  static char const * const desc[] = {
+    [ FD_FIB_NETLINK_SUCCESS   ] = "success",
+    [ FD_FIB_NETLINK_ERR_OOPS  ] = "oops",
+    [ FD_FIB_NETLINK_ERR_IO    ] = "io",
+    [ FD_FIB_NETLINK_ERR_INTR  ] = "interrupt",
+    [ FD_FIB_NETLINK_ERR_SPACE ] = "out of space",
+  };
+
+  if( err<FD_FIB_NETLINK_SUCCESS || err>FD_FIB_NETLINK_ERR_SPACE ) return "unknown";
+  return desc[ err ];
+}
diff --git a/src/waltz/ip/fd_fib4_netlink.h b/src/waltz/ip/fd_fib4_netlink.h
new file mode 100644
index 0000000000..cf0e2f169d
--- /dev/null
+++ b/src/waltz/ip/fd_fib4_netlink.h
@@ -0,0 +1,52 @@
+/* fd_fib4_netlink.h provides APIs for importing routes from Linux netlink. */
+
+#if defined(__linux__)
+
+#include "fd_fib4.h"
+#include "fd_netlink1.h"
+
+/* FD_FIB_NETLINK_* gives error codes for netlink import operations. */
+
+#define FD_FIB_NETLINK_SUCCESS (0) /* success */
+#define FD_FIB_NETLINK_ERR_OOPS (1) /* unexpected internal error */
+#define FD_FIB_NETLINK_ERR_IO (2) /* netlink I/O error */
+#define FD_FIB_NETLINK_ERR_INTR (3) /* netlink read was interrupted */
+#define FD_FIB_NETLINK_ERR_SPACE (4) /* fib is too small */
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_fib4_netlink_load_table mirrors a route table from netlink to fib.
+ The route table is requested via RTM_GETROUTE,NLM_F_REQUEST|NLM_F_DUMP.
+ table_id is in [0,2^31). table_id is typically RT_TABLE_LOCAL or
+ RT_TABLE_MAIN. These are 255 and 254 respectively on Linux. Assumes
+ netlink has a usable rtnetlink socket. fib is a writable join to a fib4
+ object in PREPARE or ACTIVE state. Logs to debug level for diagnostics
+ and warning level in case of error.
+
+ Returns FD_FIB_NETLINK_SUCCESS on success and leaves fib in ACTIVE
+ state and netlink ready for the next request. fib is not guaranteed to
+ mirror the route table precisely even on success. (May turn routes with
+ unsupported type or attribute into blackhole routes.)
+
+ On failure, leaves fib in PREPARE state (which blackholes all packets).
+ Return values FD_FIB_NETLINK_ERR_{...} in case of error as follows:
+
+ OOPS: Internal error (bug) occurred.
+ IO: Unrecoverable send/recv error or failed to parse MULTIPART msg.
+ INTR: Concurrent write overran read of the routing table. Try again.
+ SPACE: Routing table is too small to mirror the requested table.
+
+ On return, the netlink socket is ready for the next request (even in
+ case of error) unless the error is FD_FIB_NETLINK_ERR_IO. */
+
+int
+fd_fib4_netlink_load_table( fd_fib4_t * fib,
+ fd_netlink_t * netlink,
+ uint table_id );
+
+FD_FN_CONST char const *
+fd_fib4_netlink_strerror( int err );
+
+FD_PROTOTYPES_END
+
+#endif /* defined(__linux__) */
diff --git a/src/waltz/ip/fd_fib4_private.h b/src/waltz/ip/fd_fib4_private.h
new file mode 100644
index 0000000000..cdfdbd52bd
--- /dev/null
+++ b/src/waltz/ip/fd_fib4_private.h
@@ -0,0 +1,40 @@
+#ifndef HEADER_fd_src_waltz_route_fd_fib4_private_h
+#define HEADER_fd_src_waltz_route_fd_fib4_private_h
+
+#include "fd_fib4.h"
+
+struct __attribute__((aligned(FD_FIB4_ALIGN))) fd_fib4_key {
+ /* FIXME optimize this to 8 bytes? */
+ uint addr; /* prefix bits, byte swapped from the big endian input by fd_fib4_append. NOTE(review): lookup tests (dst & mask)==addr, so bits outside of mask presumably must be 0 -- confirm */
+ uint mask; /* bit pattern: top `prefix` bits set, 0 when prefix<=0 */
+ uint prio; /* route metric; lower value means higher priority */
+};
+
+typedef struct fd_fib4_key fd_fib4_key_t;
+
+struct __attribute__((aligned(FD_FIB4_ALIGN))) fd_fib4 {
+ ulong generation; /* incremented by fd_fib4_clear and fd_fib4_publish; readers compare it to detect concurrent writes */
+ uint prepare_cnt; /* >0 implies PREPARE state, ==0 implies ACTIVE */
+ uint active_cnt; /* number of routes visible to readers, including the dummy route at index 0 */
+ uint max; /* capacity in routes, set by fd_fib4_new */
+ ulong hop_off; /* byte offset from this header to the fd_fib4_hop_t table */
+ /* fd_fib4_key_t[] follows */
+ /* fd_fib4_hop_t[] follows */
+};
+
+FD_FN_CONST static inline ulong /* static inline: defined in a header, external linkage would emit one definition per includer */
+fd_fib4_key_tbl_laddr( fd_fib4_t const * fib ) {
+  return (ulong)fib + sizeof(fd_fib4_t); /* key table starts right after the header */
+}
+
+FD_FN_PURE static inline ulong /* static inline: defined in a header, external linkage would emit one definition per includer */
+fd_fib4_hop_tbl_laddr( fd_fib4_t const * fib ) {
+  return (ulong)fib + fib->hop_off; /* hop table offset is recorded in the header by fd_fib4_new */
+}
+
+FD_FN_CONST static inline fd_fib4_key_t const * fd_fib4_key_tbl_const( fd_fib4_t const * fib ) { return (fd_fib4_key_t const *)fd_fib4_key_tbl_laddr( fib ); } /* pure address arithmetic */
+FD_FN_CONST static inline fd_fib4_key_t * fd_fib4_key_tbl ( fd_fib4_t * fib ) { return (fd_fib4_key_t *) fd_fib4_key_tbl_laddr( fib ); }
+FD_FN_PURE static inline fd_fib4_hop_t const * fd_fib4_hop_tbl_const( fd_fib4_t const * fib ) { return (fd_fib4_hop_t const *)fd_fib4_hop_tbl_laddr( fib ); } /* pure, not const: reads fib->hop_off */
+FD_FN_PURE static inline fd_fib4_hop_t * fd_fib4_hop_tbl ( fd_fib4_t * fib ) { return (fd_fib4_hop_t *) fd_fib4_hop_tbl_laddr( fib ); } /* pure, not const: reads fib->hop_off */
+
+#endif /* HEADER_fd_src_waltz_route_fd_fib4_private_h */
diff --git a/src/waltz/ip/fd_netlink1.c b/src/waltz/ip/fd_netlink1.c
new file mode 100644
index 0000000000..6fc4871b80
--- /dev/null
+++ b/src/waltz/ip/fd_netlink1.c
@@ -0,0 +1,249 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "fd_netlink1.h"
+#include "../../util/fd_util.h"
+
+FD_TL ulong fd_netlink_enobufs_cnt;
+
+/* fd_nl_create_socket opens a NETLINK_ROUTE raw socket and enables
+   NETLINK_EXT_ACK on it.  Returns the file descriptor on success.  On
+   failure, logs a warning, closes the socket if it was already opened,
+   and returns -1. */
+
+static int
+fd_nl_create_socket( void ) {
+  int sock = socket( AF_NETLINK, SOCK_RAW, NETLINK_ROUTE );
+  if( FD_UNLIKELY( sock<0 ) ) {
+    FD_LOG_WARNING(( "socket(AF_NETLINK,SOCK_RAW,NETLINK_ROUTE) failed (%i-%s)",
+                     errno, fd_io_strerror( errno ) ));
+    return -1;
+  }
+
+  int enable  = 1;
+  int opt_res = setsockopt( sock, SOL_NETLINK, NETLINK_EXT_ACK, &enable, sizeof(enable) );
+  if( opt_res>=0 ) return sock;
+
+  FD_LOG_WARNING(( "setsockopt(sock,SOL_NETLINK,NETLINK_EXT_ACK) failed (%i-%s)",
+                   errno, fd_io_strerror( errno ) ));
+  close( sock );
+  return -1;
+}
+
+/* fd_nl_close_socket closes fd.  Negative descriptors are ignored. */
+
+static void
+fd_nl_close_socket( int fd ) {
+  if( fd<0 ) return;
+  close( fd );
+}
+
+/* fd_netlink_read_socket receives one datagram from the netlink socket
+   fd into buf (up to buf_sz bytes).  Zero-length reads, EINTR and
+   ENOBUFS are retried (ENOBUFS also increments
+   fd_netlink_enobufs_cnt).  Returns the positive number of bytes
+   received on success.  On any other error, logs a warning and returns
+   the negated errno value. */
+
+long
+fd_netlink_read_socket( int     fd,
+                        uchar * buf,
+                        ulong   buf_sz ) {
+  /* netlink is datagram based
+     once a recv succeeds, any un-received bytes are lost
+     and the next datagram will be properly aligned in the buffer */
+  for(;;) {
+    long len = recvfrom( fd, buf, buf_sz, 0, NULL, NULL );
+    if( FD_LIKELY( len>0L ) ) return len;
+    if( len==0L ) continue;
+
+    /* Snapshot errno right away: the logging call below may itself
+       perform syscalls that overwrite it, which would make the
+       returned error code wrong. */
+    int recv_errno = errno;
+    if( recv_errno==EINTR ) continue;
+    if( recv_errno==ENOBUFS ) {
+      fd_netlink_enobufs_cnt++;
+      continue;
+    }
+    FD_LOG_WARNING(( "netlink recv failed (%d-%s)", recv_errno, fd_io_strerror( recv_errno ) ));
+    return -(long)recv_errno;
+  }
+}
+
+/* fd_netlink_init opens a netlink socket for nl and sets the initial
+   sequence number to seq0.  Returns nl on success.  If the socket
+   could not be created, nl->fd is left negative and NULL is returned. */
+
+fd_netlink_t *
+fd_netlink_init( fd_netlink_t * nl,
+                 uint           seq0 ) {
+  int sock = fd_nl_create_socket();
+  nl->fd = sock;
+  if( FD_UNLIKELY( sock<0 ) ) return NULL;
+
+  nl->seq = seq0;
+  return nl;
+}
+
+/* fd_netlink_fini closes the socket held by nl (if any) and marks the
+   descriptor as invalid (-1).  Returns nl. */
+
+void *
+fd_netlink_fini( fd_netlink_t * nl ) {
+ fd_nl_close_socket( nl->fd );
+ nl->fd = -1;
+ return nl;
+}
+
+/* fd_netlink_iter_recvmsg reads the next datagram from the netlink
+   socket into the iterator's buffer.  On success, [msg0,msg1) spans
+   the received bytes.  On failure, iter->err is set to the positive
+   errno value and msg0/msg1 are left unchanged. */
+
+static void
+fd_netlink_iter_recvmsg( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink ) {
+ long len = fd_netlink_read_socket( netlink->fd, iter->buf, iter->buf_sz );
+ if( len<0L ) {
+ iter->err = (int)-len; /* read_socket returns -errno */
+ return;
+ }
+ iter->msg0 = iter->buf;
+ iter->msg1 = iter->buf+len;
+}
+
+/* fd_netlink_iter_bounds_check bounds checks the current message.  If
+   out-of-bounds, logs warning and sets error EPROTO.  This prevents
+   the iterator from returning an out-of-bounds netlink message.  Does
+   nothing if the iterator is already done. */
+
+static void
+fd_netlink_iter_bounds_check( fd_netlink_iter_t * iter ) {
+  if( fd_netlink_iter_done( iter ) ) return;
+
+  /* Compare lengths instead of forming msg0+len: the length field
+     comes from the wire, and a bogus value would otherwise produce an
+     out-of-bounds pointer before the comparison takes place. */
+  ulong avail = (ulong)( iter->msg1 - iter->msg0 );
+  if( FD_UNLIKELY( avail < sizeof(struct nlmsghdr) ) ) {
+    FD_LOG_WARNING(( "netlink message header out-of-bounds" ));
+    iter->err = EPROTO;
+    return;
+  }
+
+  struct nlmsghdr const * nlh = fd_type_pun_const( iter->msg0 );
+  if( FD_UNLIKELY( nlh->nlmsg_len < sizeof(struct nlmsghdr) ) ) {
+    /* prevent infinite loop */
+    FD_LOG_WARNING(( "netlink message smaller than header" ));
+    iter->err = EPROTO;
+    return;
+  }
+  if( FD_UNLIKELY( (ulong)nlh->nlmsg_len > avail ) ) {
+    FD_LOG_WARNING(( "netlink message out-of-bounds: cur=[%p,%p) buf=[%p,%p)",
+                     (void *)iter->msg0, (void *)iter->msg1, (void *)iter->buf, (void *)( iter->buf+iter->buf_sz ) ));
+    iter->err = EPROTO;
+    return;
+  }
+}
+
+/* fd_netlink_iter_init resets iter to use the caller-provided receive
+   buffer buf (buf_sz bytes), reads the first datagram from netlink and
+   validates the first message.  Failures are reported via iter->err
+   (in which case fd_netlink_iter_done returns 1).  Returns iter. */
+
+fd_netlink_iter_t *
+fd_netlink_iter_init( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink,
+ uchar * buf,
+ ulong buf_sz ) {
+ *iter = (fd_netlink_iter_t) {
+ .buf = buf,
+ .buf_sz = buf_sz,
+ .msg0 = buf,
+ .msg1 = buf,
+ };
+
+ fd_netlink_iter_recvmsg( iter, netlink );
+ fd_netlink_iter_bounds_check( iter );
+
+ return iter;
+}
+
+/* fd_netlink_iter_done returns 1 if iteration has ended: an error (or
+   end-of-sequence marker) is set in iter->err, fewer bytes than a
+   netlink header remain in [msg0,msg1), or the current message is
+   NLMSG_DONE.  Returns 0 otherwise. */
+
+int
+fd_netlink_iter_done( fd_netlink_iter_t const * iter ) {
+ if( (iter->err!=0) | ( iter->msg1 - iter->msg0 < (long)sizeof(struct nlmsghdr) ) ) {
+ return 1;
+ }
+ struct nlmsghdr const * nlh = fd_type_pun_const( iter->msg0 );
+ return nlh->nlmsg_type==NLMSG_DONE;
+}
+
+/* fd_netlink_iter_next advances iter past the current message.  No-op
+   if the iterator is already done.  If the current message is not part
+   of a multipart sequence, iteration ends (iter->err is set to -1).
+   Otherwise the cursor moves by the aligned message length; when the
+   received data is exhausted, another datagram is read from netlink.
+   The new current message is bounds checked.  Returns iter. */
+
+fd_netlink_iter_t *
+fd_netlink_iter_next( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink ) {
+
+ if( fd_netlink_iter_done( iter ) ) return iter;
+
+ struct nlmsghdr const * nlh = fd_type_pun_const( iter->msg0 );
+ if( !(nlh->nlmsg_flags & NLM_F_MULTI) ) {
+ /* Last message was not a multipart message */
+ iter->err = -1; /* eof */
+ return iter;
+ }
+ iter->msg0 += NLMSG_ALIGN( nlh->nlmsg_len );
+
+ /* Current datagram fully consumed, fetch the next one */
+ if( iter->msg0 >= iter->msg1 ) {
+ fd_netlink_iter_recvmsg( iter, netlink );
+ }
+ fd_netlink_iter_bounds_check( iter );
+
+ return iter;
+}
+
+/* fd_netlink_rtm_type_str returns a static lowercase name for the
+   given RTN_* route type, or "unknown" if the value is not recognized. */
+
+char const *
+fd_netlink_rtm_type_str( int rtm_type ) {
+ switch( rtm_type ) {
+ case RTN_UNSPEC: return "unspec";
+ case RTN_UNICAST: return "unicast";
+ case RTN_LOCAL: return "local";
+ case RTN_BROADCAST: return "broadcast";
+ case RTN_ANYCAST: return "anycast";
+ case RTN_MULTICAST: return "multicast";
+ case RTN_BLACKHOLE: return "blackhole";
+ case RTN_UNREACHABLE: return "unreachable";
+ case RTN_PROHIBIT: return "prohibit";
+ case RTN_THROW: return "throw";
+ case RTN_NAT: return "nat";
+ case RTN_XRESOLVE: return "xresolve";
+ default: return "unknown";
+ }
+}
+
+/* fd_netlink_rtattr_str returns a static lowercase name for the given
+   RTA_* route attribute type, or "unknown" if the value is not
+   recognized.  Attributes guarded by #ifdef are only handled when the
+   corresponding preprocessor symbol is defined at compile time. */
+
+char const *
+fd_netlink_rtattr_str( int rta_type ) {
+ switch( rta_type ) {
+ /* These exist since at least Linux v3.7 */
+ case RTA_DST: return "dst";
+ case RTA_SRC: return "src";
+ case RTA_IIF: return "iif";
+ case RTA_OIF: return "oif";
+ case RTA_GATEWAY: return "gateway";
+ case RTA_PRIORITY: return "priority";
+ case RTA_PREFSRC: return "prefsrc";
+ case RTA_METRICS: return "metrics";
+ case RTA_MULTIPATH: return "multipath";
+ case RTA_FLOW: return "flow";
+ case RTA_CACHEINFO: return "cacheinfo";
+ case RTA_TABLE: return "table";
+ case RTA_MARK: return "mark";
+#ifdef RTA_MFC_STATS
+ case RTA_MFC_STATS: return "mfc_stats";
+#endif
+#ifdef RTA_VIA
+ case RTA_VIA: return "via";
+#endif
+#ifdef RTA_NEWDST
+ case RTA_NEWDST: return "newdst";
+#endif
+#ifdef RTA_PREF
+ case RTA_PREF: return "pref";
+#endif
+#ifdef RTA_ENCAP_TYPE
+ case RTA_ENCAP_TYPE: return "encap_type";
+#endif
+#ifdef RTA_ENCAP
+ case RTA_ENCAP: return "encap";
+#endif
+#ifdef RTA_EXPIRES
+ case RTA_EXPIRES: return "expires";
+#endif
+#ifdef RTA_PAD
+ case RTA_PAD: return "pad";
+#endif
+#ifdef RTA_UID
+ case RTA_UID: return "uid";
+#endif
+#ifdef RTA_TTL_PROPAGATE
+ case RTA_TTL_PROPAGATE: return "ttl_propagate";
+#endif
+#ifdef RTA_IP_PROTO
+ case RTA_IP_PROTO: return "ip_proto";
+#endif
+#ifdef RTA_SPORT
+ case RTA_SPORT: return "sport";
+#endif
+#ifdef RTA_DPORT
+ case RTA_DPORT: return "dport";
+#endif
+#ifdef RTA_NH_ID
+ case RTA_NH_ID: return "nh_id";
+#endif
+ default: return "unknown";
+ }
+}
diff --git a/src/waltz/ip/fd_netlink1.h b/src/waltz/ip/fd_netlink1.h
new file mode 100644
index 0000000000..e43bad39c5
--- /dev/null
+++ b/src/waltz/ip/fd_netlink1.h
@@ -0,0 +1,107 @@
+#ifndef HEADER_fd_src_waltz_ip_fd_netlink_h
+#define HEADER_fd_src_waltz_ip_fd_netlink_h
+
+#if defined(__linux__)
+
+#include "../../util/fd_util_base.h"
+
+/* fd_netlink_t holds the state of a netlink session (see
+   fd_netlink_init and fd_netlink_fini). */
+
+struct fd_netlink {
+ int fd; /* netlink socket */
+ uint seq; /* netlink sequence number */
+};
+
+typedef struct fd_netlink fd_netlink_t;
+
+/* FIXME this should be a 'buffered reader' style API not an iterator since
+ iterators are infallible by definition in Firedancer style. */
+
+/* fd_netlink_iter_t is the state of an iteration over incoming netlink
+   messages.  The receive buffer is provided by the caller of
+   fd_netlink_iter_init. */
+
+struct fd_netlink_iter {
+ uchar * buf;    /* receive buffer */
+ ulong buf_sz;   /* size of receive buffer in bytes */
+ uchar * msg0;   /* start of current message */
+ uchar * msg1;   /* end of received data */
+ int err;        /* 0 if ok, >0 errno-style error, -1 end of sequence */
+};
+
+typedef struct fd_netlink_iter fd_netlink_iter_t;
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_netlink_enobufs_cnt counts the number of ENOBUFS error occurrences. */
+
+extern FD_TL ulong fd_netlink_enobufs_cnt;
+
+/* fd_netlink_init creates a new netlink session. Creates a new netlink
+ socket with explicit ACKs. seq0 is the initial sequence number. */
+
+fd_netlink_t *
+fd_netlink_init( fd_netlink_t * netlink,
+ uint seq0 );
+
+/* fd_netlink_fini closes the netlink socket. */
+
+void *
+fd_netlink_fini( fd_netlink_t * netlink );
+
+/* fd_netlink_read_socket wraps recvfrom(fd,buf,buf_sz,0,0,0) but
+ automatically skips EINTR and ENOBUFS errors. */
+
+long
+fd_netlink_read_socket( int fd,
+ uchar * buf,
+ ulong buf_sz );
+
+/* fd_netlink_iter_init prepares iteration over a sequence of incoming
+ netlink multipart messages. */
+
+fd_netlink_iter_t *
+fd_netlink_iter_init( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink,
+ uchar * buf,
+ ulong buf_sz );
+
+/* fd_netlink_iter_done returns 0 if there are more netlink messages to
+ iterate over or 1 if not. */
+
+int
+fd_netlink_iter_done( fd_netlink_iter_t const * iter );
+
+/* fd_netlink_iter_next advances the iterator to the next netlink message
+ (if any). Assumes !fd_netlink_iter_done(iter). Invalidates pointers
+ previously returned by fd_netlink_iter_msg(iter). */
+
+fd_netlink_iter_t *
+fd_netlink_iter_next( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink );
+
+/* fd_netlink_iter_msg returns a pointer to the current netlink message
+ header. Assumes !fd_netlink_iter_done(iter). */
+
+static inline struct nlmsghdr const *
+fd_netlink_iter_msg( fd_netlink_iter_t const * iter ) {
+  uchar const * cur = iter->msg0; /* cursor sits on the current header */
+  return fd_type_pun_const( cur );
+}
+
+/* fd_netlink_iter_drain advances iter until it is done, discarding all
+   remaining messages.  Returns the number of fd_netlink_iter_next
+   calls made.  Returns 0 immediately if iter is already done (which
+   includes the case where iter->err is set). */
+
+static FD_FN_UNUSED ulong
+fd_netlink_iter_drain( fd_netlink_iter_t * iter,
+ fd_netlink_t * netlink ) {
+ ulong cnt;
+ for( cnt=0UL; !fd_netlink_iter_done( iter ); cnt++ ) {
+ fd_netlink_iter_next( iter, netlink );
+ }
+ return cnt;
+}
+
+/* Debug utils */
+
+char const *
+fd_netlink_rtm_type_str( int rtm_type );
+
+char const *
+fd_netlink_rtattr_str( int rta_type );
+
+FD_PROTOTYPES_END
+
+#endif /* defined(__linux__) */
+
+#endif /* HEADER_fd_src_waltz_ip_fd_netlink_h */
diff --git a/src/waltz/ip/test_fib4.c b/src/waltz/ip/test_fib4.c
new file mode 100644
index 0000000000..a4cb776a34
--- /dev/null
+++ b/src/waltz/ip/test_fib4.c
@@ -0,0 +1,150 @@
+#define _POSIX_C_SOURCE 200809L /* fmemopen */
+#include "fd_fib4.h"
+#include "../../util/fd_util.h"
+#include "../../util/net/fd_ip4.h"
+
+static uchar __attribute__((aligned(FD_FIB4_ALIGN)))
+fib1_mem[ 4096 ];
+
+static uchar __attribute__((aligned(FD_FIB4_ALIGN)))
+fib2_mem[ 4096 ];
+
+#if FD_HAS_HOSTED
+#include
+
+/* test_fib_print renders fib with fd_fib4_fprintf into an in-memory
+   stream and checks that the output matches the expected text exactly
+   (the parameter is historically named "actual" but carries the
+   expected dump).  On mismatch, writes the rendered dump to stderr and
+   fails the test via FD_LOG_ERR. */
+
+static void
+test_fib_print( fd_fib4_t const * fib,
+                char const *      actual ) {
+  static char dump_buf[ 8192 ];
+  FILE * dump = fmemopen( dump_buf, sizeof(dump_buf), "w" );
+  FD_TEST( dump );
+  FD_TEST( 0==fd_fib4_fprintf( fib, dump ) );
+  long pos = ftell( dump );
+  FD_TEST( pos>=0L );
+  ulong sz = (ulong)pos;
+  fclose( dump ); /* flushes stream contents into dump_buf */
+
+  /* Require equal lengths as well: comparing only the first sz bytes
+     would accept an empty or truncated dump as a match. */
+  int mismatch = ( sz!=strlen( actual ) ) || ( 0!=strncmp( dump_buf, actual, sz ) );
+  if( FD_UNLIKELY( mismatch ) ) {
+    fwrite( dump_buf, 1, sz, stderr );
+    fflush( stderr );
+    FD_LOG_ERR(( "FAIL: fd_fib4_fprintf(fib) != expected" ));
+  }
+}
+
+#else /* !FD_HAS_HOSTED */
+
+#define test_fib_print(...)
+
+#endif
+
+/* Unit test for fd_fib4: checks lookup results in PREPARE and empty
+   ACTIVE state, then populates a "local" and a "main" table modelled
+   on the production scenario below, verifies the printed dumps and a
+   set of combined local+main route queries. */
+
+int
+main( int argc,
+ char ** argv ) {
+ fd_boot( &argc, &argv );
+
+ FD_TEST( fd_ulong_is_aligned( (ulong)fib1_mem, fd_fib4_align() ) );
+ FD_TEST( fd_fib4_footprint( 16 )<=sizeof(fib1_mem) );
+ fd_fib4_t * fib_local = fd_fib4_join( fd_fib4_new( fib1_mem, 16 ) );
+ fd_fib4_t * fib_main = fd_fib4_join( fd_fib4_new( fib2_mem, 16 ) );
+ fd_fib4_hop_t candidate[2];
+
+ /* Ensure FIB in PREPARE state returns BLACKHOLE */
+
+ FD_TEST( fd_fib4_lookup( fib_local, candidate, 0x12345678, 0 )->rtype==FD_FIB4_RTYPE_BLACKHOLE );
+
+ /* Ensure empty FIB in ACTIVE returns THROW */
+
+ fd_fib4_publish( fib_local );
+ FD_TEST( fd_fib4_lookup( fib_local, candidate, 0x12345678, 0 )->rtype==FD_FIB4_RTYPE_THROW );
+
+ /* Simple production scenario
+
+ # ip route list table local
+ broadcast 192.0.2.160 dev bond0 proto kernel scope link src 192.0.2.165
+ local 192.0.2.165 dev bond0 proto kernel scope host src 192.0.2.165
+ broadcast 192.0.2.191 dev bond0 proto kernel scope link src 192.0.2.165
+ broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
+ local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+ local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
+ broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
+
+ # ip route list table main
+ default via 192.0.2.161 dev bond0 proto dhcp src 192.0.2.165 metric 300
+ 192.0.2.160/27 dev bond0 proto kernel scope link src 192.0.2.165 metric 300 */
+
+ fd_fib4_clear( fib_local );
+ FD_TEST( fd_fib4_free_cnt( fib_local )>=7 );
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,160 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,165 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=6, .scope=254, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 192,0,2,191 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,0 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=1, .scope=253, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,0 ), 8, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=1, .scope=254, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,0,1 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_LOCAL, .if_idx=1, .scope=254, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) };
+ *fd_fib4_append( fib_local, FD_IP4_ADDR( 127,0,255,255 ), 32, 0 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_BROADCAST, .if_idx=1, .scope=253, .ip4_src=FD_IP4_ADDR( 127,0,0,1 ) };
+ fd_fib4_publish( fib_local );
+
+ test_fib_print( fib_local,
+ "throw default metric 4294967295\n"
+ "broadcast 192.0.2.160 dev 6 scope link src 192.0.2.165\n"
+ "local 192.0.2.165 dev 6 scope host src 192.0.2.165\n"
+ "broadcast 192.0.2.191 dev 6 scope link src 192.0.2.165\n"
+ "broadcast 127.0.0.0 dev 1 scope link src 127.0.0.1\n"
+ "local 127.0.0.0/8 dev 1 scope host src 127.0.0.1\n"
+ "local 127.0.0.1 dev 1 scope host src 127.0.0.1\n"
+ "broadcast 127.0.255.255 dev 1 scope link src 127.0.0.1\n" );
+
+ fd_fib4_clear( fib_main );
+ FD_TEST( fd_fib4_free_cnt( fib_main )>=2 );
+ *fd_fib4_append( fib_main, FD_IP4_ADDR( 0,0,0,0 ), 0, 300 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_UNICAST, .ip4_gw=FD_IP4_ADDR( 192,0,2,161 ), .if_idx=6, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) };
+ *fd_fib4_append( fib_main, FD_IP4_ADDR( 192,0,2,161 ), 27, 300 ) = (fd_fib4_hop_t){ .rtype=FD_FIB4_RTYPE_UNICAST, .if_idx=6, .scope=253, .ip4_src=FD_IP4_ADDR( 192,0,2,165 ) };
+ fd_fib4_publish( fib_main );
+
+ test_fib_print( fib_main,
+ "throw default metric 4294967295\n"
+ "default via 192.0.2.161 dev 6 src 192.0.2.165 metric 300\n"
+ "192.0.2.161/27 dev 6 scope link src 192.0.2.165 metric 300\n" );
+
+ /* QUERY looks up ip in the local table and in the main table and
+    combines both results with fd_fib4_hop_or */
+# define QUERY(ip) fd_fib4_hop_or( fd_fib4_lookup( fib_local, candidate+0, FD_IP4_ADDR ip, 0 ), fd_fib4_lookup( fib_main, candidate+1, FD_IP4_ADDR ip, 0 ) )
+ fd_fib4_hop_t const * next;
+
+ /* $ ip route get 127.0.0.1
+ local 127.0.0.1 dev lo src 127.0.0.1 */
+ next = QUERY(( 127,0,0,1 ));
+ FD_TEST( next->rtype==FD_FIB4_RTYPE_LOCAL );
+ FD_TEST( next->if_idx==1 );
+ FD_TEST( next->ip4_src==FD_IP4_ADDR( 127,0,0,1 ) );
+
+ /* $ ip route get 192.0.2.160
+ broadcast 192.0.2.160 dev bond0 src 192.0.2.165 */
+ next = QUERY(( 192,0,2,160 ));
+ FD_TEST( next->rtype==FD_FIB4_RTYPE_BROADCAST );
+ FD_TEST( next->if_idx==6 );
+ FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) );
+
+ /* $ ip route get 192.0.2.161
+ 192.0.2.161 dev bond0 src 192.0.2.165 */
+ next = QUERY(( 192,0,2,161 ));
+ FD_TEST( next->rtype==FD_FIB4_RTYPE_UNICAST );
+ FD_TEST( next->if_idx==6 );
+ FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) );
+
+ /* $ ip route get 192.0.2.191
+ broadcast 192.0.2.191 dev bond0 src 192.0.2.165 */
+ next = QUERY(( 192,0,2,191 ));
+ FD_TEST( next->rtype==FD_FIB4_RTYPE_BROADCAST );
+ FD_TEST( next->if_idx==6 );
+ FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) );
+
+ /* $ ip route get 8.8.8.8
+ 8.8.8.8 via 192.0.2.161 dev bond0 src 192.0.2.165 */
+ next = QUERY(( 8,8,8,8 ));
+ FD_TEST( next->rtype==FD_FIB4_RTYPE_UNICAST );
+ FD_TEST( next->ip4_gw==FD_IP4_ADDR( 192,0,2,161 ) );
+ FD_TEST( next->if_idx==6 );
+ FD_TEST( next->ip4_src==FD_IP4_ADDR( 192,0,2,165 ) );
+
+# undef QUERY
+
+ fd_fib4_delete( fd_fib4_leave( fib_local ) );
+ fd_fib4_delete( fd_fib4_leave( fib_main ) );
+
+ fd_halt();
+ return 0;
+}
diff --git a/src/waltz/ip/test_fib4_netlink.c b/src/waltz/ip/test_fib4_netlink.c
new file mode 100644
index 0000000000..91b3aa44d5
--- /dev/null
+++ b/src/waltz/ip/test_fib4_netlink.c
@@ -0,0 +1,53 @@
+#include
+#include /* RT_TABLE_MAIN */
+#include "fd_fib4_netlink.h"
+#include "../../util/fd_util.h"
+
+#define DEFAULT_FIB_SZ (1<<20) /* 1 MiB */
+
+static uchar __attribute__((aligned(FD_FIB4_ALIGN)))
+fib1_mem[ DEFAULT_FIB_SZ ];
+
+/* dump_table loads the given kernel routing table via netlink into a
+   fib backed by fib1_mem and prints it to stderr.  If loading fails, a
+   warning is logged and nothing is printed.
+   NOTE(review): the failure path returns without the
+   fd_fib4_delete( fd_fib4_leave( fib ) ) done on the success path. */
+
+void
+dump_table( fd_netlink_t * netlink,
+ uint table ) {
+ ulong const route_max = 256UL;
+ FD_TEST( fd_fib4_footprint( route_max )<=sizeof(fib1_mem) );
+ fd_fib4_t * fib = fd_fib4_join( fd_fib4_new( fib1_mem, route_max ) );
+
+ int load_err = fd_fib4_netlink_load_table( fib, netlink, table );
+ if( FD_UNLIKELY( load_err ) ) {
+ FD_LOG_WARNING(( "Failed to load table %u (%i-%s)", table, load_err, fd_fib4_netlink_strerror( load_err ) ));
+ return;
+ }
+
+ fprintf( stderr, "# ip route show table %u\n", table );
+ fd_log_flush();
+ fd_fib4_fprintf( fib, stderr );
+ fputs( "\n", stderr );
+
+ fd_fib4_delete( fd_fib4_leave( fib ) );
+}
+
+/* Opens a netlink session and dumps the local and main IPv4 routing
+   tables of the host to stderr. */
+
+int
+main( int argc,
+ char ** argv ) {
+ fd_boot( &argc, &argv );
+
+ fd_netlink_t _netlink[1];
+ fd_netlink_t * netlink = fd_netlink_init( _netlink, 42U );
+ FD_TEST( netlink );
+
+ FD_LOG_NOTICE(( "Dumping local and main routing tables to stderr\n" ));
+ fd_log_flush();
+ dump_table( netlink, RT_TABLE_LOCAL );
+ dump_table( netlink, RT_TABLE_MAIN );
+ fflush( stderr );
+
+ fd_netlink_fini( netlink );
+
+ fd_halt();
+ return 0;
+}
diff --git a/src/waltz/mib/Local.mk b/src/waltz/mib/Local.mk
new file mode 100644
index 0000000000..e8f1215d93
--- /dev/null
+++ b/src/waltz/mib/Local.mk
@@ -0,0 +1,9 @@
+$(call add-hdrs,fd_dbl_buf.h)
+$(call add-objs,fd_dbl_buf,fd_waltz)
+$(call add-hdrs,fd_netdev_tbl.h)
+$(call add-objs,fd_netdev_tbl,fd_waltz)
+ifdef FD_HAS_LINUX
+$(call add-hdrs,fd_netdev_netlink.h)
+$(call add-objs,fd_netdev_netlink,fd_waltz)
+$(call make-unit-test,test_netdev_netlink,test_netdev_netlink,fd_waltz fd_util)
+endif
diff --git a/src/waltz/mib/fd_dbl_buf.c b/src/waltz/mib/fd_dbl_buf.c
new file mode 100644
index 0000000000..36191ea230
--- /dev/null
+++ b/src/waltz/mib/fd_dbl_buf.c
@@ -0,0 +1,138 @@
+#include "fd_dbl_buf.h"
+#include "../../util/log/fd_log.h"
+#include "../../tango/fd_tango_base.h"
+
+#if FD_HAS_SSE
+#include "../../util/simd/fd_sse.h"
+#endif
+
+/* fd_dbl_buf_align returns FD_DBL_BUF_ALIGN. */
+
+ulong
+fd_dbl_buf_align( void ) {
+ return FD_DBL_BUF_ALIGN;
+}
+
+/* fd_dbl_buf_footprint returns FD_DBL_BUF_FOOTPRINT( mtu ), the size
+   of the header plus two message buffers. */
+
+ulong
+fd_dbl_buf_footprint( ulong mtu ) {
+ return FD_DBL_BUF_FOOTPRINT( mtu );
+}
+
+/* fd_dbl_buf_new formats shmem as a double buffer: a header followed
+   by two message buffers, each mtu bytes rounded up to
+   FD_DBL_BUF_ALIGN.  The header records the byte offsets of both
+   buffers relative to the header, the initial sequence number seq0 and
+   a zero message size.  The magic is written last.  Returns a pointer
+   to the header on success, or NULL (with a warning logged) if shmem
+   is NULL or misaligned. */
+
+void *
+fd_dbl_buf_new( void * shmem,
+ ulong mtu,
+ ulong seq0 ) {
+
+ if( FD_UNLIKELY( !shmem ) ) {
+ FD_LOG_WARNING(( "NULL shmem" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, FD_DBL_BUF_ALIGN ) ) ) {
+ FD_LOG_WARNING(( "misaligned shmem" ));
+ return NULL;
+ }
+
+ ulong mtu_align = fd_ulong_align_up( mtu, FD_DBL_BUF_ALIGN );
+ FD_SCRATCH_ALLOC_INIT( l, shmem );
+ fd_dbl_buf_t * dbl_buf = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, sizeof(fd_dbl_buf_t) );
+ void * buf0 = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, mtu_align );
+ void * buf1 = FD_SCRATCH_ALLOC_APPEND( l, FD_DBL_BUF_ALIGN, mtu_align );
+ FD_SCRATCH_ALLOC_FINI( l, FD_DBL_BUF_ALIGN );
+
+ /* buf0/buf1 are stored as byte offsets from the header */
+ *dbl_buf = (fd_dbl_buf_t) {
+ .magic = 0UL,
+ .seq = seq0,
+ .sz = 0UL,
+ .mtu = mtu,
+ .buf0 = (ulong)buf0 - (ulong)dbl_buf,
+ .buf1 = (ulong)buf1 - (ulong)dbl_buf
+ };
+
+ /* Set the magic only after the rest of the header is initialized */
+ FD_COMPILER_MFENCE();
+ FD_VOLATILE( dbl_buf->magic ) = FD_DBL_BUF_MAGIC;
+ FD_COMPILER_MFENCE();
+
+ return dbl_buf;
+}
+
+/* fd_dbl_buf_join joins the caller to the double buffer at shbuf.
+   Returns the typed handle on success, or NULL (with a warning logged)
+   if shbuf is NULL, misaligned, or does not carry FD_DBL_BUF_MAGIC. */
+
+fd_dbl_buf_t *
+fd_dbl_buf_join( void * shbuf ) {
+
+ if( FD_UNLIKELY( !shbuf ) ) {
+ FD_LOG_WARNING(( "NULL shbuf" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shbuf, FD_DBL_BUF_ALIGN ) ) ) {
+ FD_LOG_WARNING(( "misaligned shbuf" ));
+ return NULL;
+ }
+
+ fd_dbl_buf_t * dbl_buf = shbuf;
+ if( FD_UNLIKELY( dbl_buf->magic!=FD_DBL_BUF_MAGIC ) ) {
+ FD_LOG_WARNING(( "bad magic" ));
+ return NULL;
+ }
+
+ return dbl_buf;
+}
+
+/* fd_dbl_buf_leave undoes a join.  Returns buf unchanged. */
+
+void *
+fd_dbl_buf_leave( fd_dbl_buf_t * buf ) {
+ return buf;
+}
+
+/* fd_dbl_buf_delete unformats the double buffer at shbuf by zeroing
+   its magic.  Returns shbuf on success, or NULL (with a warning
+   logged) if shbuf is NULL or misaligned.  The current magic value is
+   not checked. */
+
+void *
+fd_dbl_buf_delete( void * shbuf ) {
+
+ if( FD_UNLIKELY( !shbuf ) ) {
+ FD_LOG_WARNING(( "NULL shbuf" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shbuf, FD_DBL_BUF_ALIGN ) ) ) {
+ FD_LOG_WARNING(( "misaligned shbuf" ));
+ return NULL;
+ }
+
+ fd_dbl_buf_t * dbl_buf = shbuf;
+ FD_COMPILER_MFENCE();
+ FD_VOLATILE( dbl_buf->magic ) = 0UL;
+ FD_COMPILER_MFENCE();
+ return dbl_buf;
+}
+
+/* fd_dbl_buf_insert publishes a new message.  The message is truncated
+   to buf->mtu bytes, copied into the slot selected by the next
+   sequence number, and only then are the sequence number and size in
+   the header updated. */
+
+void
+fd_dbl_buf_insert( fd_dbl_buf_t * buf,
+ void const * msg,
+ ulong sz ) {
+ /* */ sz = fd_ulong_min( sz, buf->mtu );
+ ulong seq = fd_seq_inc( buf->seq, 1UL );
+ void * dst = fd_dbl_buf_slot( buf, seq );
+
+ fd_memcpy( dst, msg, sz );
+
+# if FD_HAS_SSE
+ /* Update seq and sz together with a single 16-byte store */
+ FD_COMPILER_MFENCE();
+ vv_t seq_sz = vv( seq, sz );
+ _mm_store_si128( &buf->seq_sz, seq_sz );
+ FD_COMPILER_MFENCE();
+# else
+ /* Store sz first, then seq, with compiler fences in between */
+ buf->sz = sz;
+ FD_COMPILER_MFENCE();
+ buf->seq = seq;
+ FD_COMPILER_MFENCE();
+# endif
+}
+
+/* fd_dbl_buf_read copies the latest message into obj, retrying
+   fd_dbl_buf_try_read until an attempt is not overrun.  Returns the
+   message size.  If opt_seqp is non-NULL, it receives the sequence
+   number of the message read. */
+
+ulong
+fd_dbl_buf_read( fd_dbl_buf_t * buf,
+                 void *         obj,
+                 ulong *        opt_seqp ) {
+  ulong   seq_scratch[1];
+  ulong * seq_out = opt_seqp;
+  if( !seq_out ) seq_out = seq_scratch;
+
+  for(;;) {
+    ulong msg_sz = fd_dbl_buf_try_read( buf, obj, seq_out );
+    if( FD_LIKELY( msg_sz!=ULONG_MAX ) ) return msg_sz;
+  }
+}
diff --git a/src/waltz/mib/fd_dbl_buf.h b/src/waltz/mib/fd_dbl_buf.h
new file mode 100644
index 0000000000..fbe39d9be9
--- /dev/null
+++ b/src/waltz/mib/fd_dbl_buf.h
@@ -0,0 +1,165 @@
+#ifndef HEADER_fd_src_waltz_mib_fd_dbl_buf_h
+#define HEADER_fd_src_waltz_mib_fd_dbl_buf_h
+
+/* fd_dbl_buf.h provides a concurrent lock-free double buffer. A double
+ buffer contains two buffers that take turns holding a message for
+ consumers and receiving a new message from a producer.
+
+ Supports a single producer thread and an arbitrary number of consumer
+ threads. Optimized for rare updates and frequent polling (e.g. config).
+ Use an fd_tango mcache/dcache pair if you need frequent updates.
+
+ Currently assumes a memory model that preserves store order across
+ threads (e.g. x86-TSO). Does not use atomics or hardware fences. */
+
+#include "../../util/bits/fd_bits.h"
+#if FD_HAS_SSE
+#include
+#endif
+
+/* FIXME COULD ALLOW FOR IN-PLACE READS WITH PODs BY ADDING A MSG ALIGN ARGUMENT */
+
+/* fd_dbl_buf_t is the header of a dbl_buf object. May not be locally
+ declared. */
+
+/* The union overlays two views of the same 64 byte header: a scalar
+   view with individual fields and, when SSE is available, a vector
+   view grouping the fields in 16 byte pairs.  The vector view lets the
+   writer update seq and sz with one 16 byte store (see
+   fd_dbl_buf_insert). */
+
+union __attribute__((aligned(16UL))) fd_dbl_buf {
+
+ struct {
+ ulong magic; /* ==FD_DBL_BUF_MAGIC */
+ ulong mtu;
+ ulong buf0; /* offset to first buffer from beginning of struct */
+ ulong buf1; /* — " — second — " — */
+ ulong seq; /* latest msg seq no */
+ ulong sz; /* latest msg size */
+ ulong pad[2];
+ /* objects follow here */
+ };
+
+# if FD_HAS_SSE
+ struct {
+ __m128i magic_mtu;
+ __m128i buf0_buf1;
+ __m128i seq_sz;
+ __m128i pad2;
+ };
+# endif
+
+};
+
+typedef union fd_dbl_buf fd_dbl_buf_t;
+
+#define FD_DBL_BUF_MAGIC (0xa6c6f85d431c03ceUL) /* random */
+
+#define FD_DBL_BUF_ALIGN (16UL)
+#define FD_DBL_BUF_FOOTPRINT(mtu) \
+ FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
+ FD_DBL_BUF_ALIGN, sizeof(fd_dbl_buf_t) ), \
+ FD_DBL_BUF_ALIGN, FD_ULONG_ALIGN_UP( mtu, FD_DBL_BUF_ALIGN )<<1UL ), \
+ FD_DBL_BUF_ALIGN )
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_dbl_buf_{align,footprint} describe the memory region of a double
+ buffer. mtu is the largest possible message size. */
+
+ulong
+fd_dbl_buf_align( void );
+
+ulong
+fd_dbl_buf_footprint( ulong mtu );
+
+/* fd_dbl_buf_new formats a memory region for use as a double buffer.
+ shmem points to the memory region matching fd_dbl_buf_{align,footprint}.
+ Initially, the active object of the double buffer will have sequence
+ number seq0 and zero byte size. */
+
+void *
+fd_dbl_buf_new( void * shmem,
+ ulong mtu,
+ ulong seq0 );
+
+fd_dbl_buf_t *
+fd_dbl_buf_join( void * shbuf );
+
+void *
+fd_dbl_buf_leave( fd_dbl_buf_t * buf );
+
+/* fd_dbl_buf_delete unformats the memory region backing a dbl_buf and
+ releases ownership back to the caller. Returns shbuf. */
+
+void *
+fd_dbl_buf_delete( void * shbuf );
+
+/* fd_dbl_buf_obj_mtu returns the max message size a dbl_buf can store. */
+
+static inline ulong
+fd_dbl_buf_obj_mtu( fd_dbl_buf_t * buf ) {
+ return buf->mtu; /* set by fd_dbl_buf_new */
+}
+
+/* fd_dbl_buf_seq_query peeks the current sequence number. */
+
+static inline ulong
+fd_dbl_buf_seq_query( fd_dbl_buf_t * buf ) {
+ /* Volatile load bracketed by compiler fences so the compiler neither
+    caches the value nor reorders surrounding accesses across it */
+ FD_COMPILER_MFENCE();
+ ulong seq = FD_VOLATILE_CONST( buf->seq );
+ FD_COMPILER_MFENCE();
+ return seq;
+}
+
+/* fd_dbl_buf_slot returns a pointer to the buffer for the given sequence
+   number.  Odd sequence numbers map to the second buffer, even ones to
+   the first.  buf0/buf1 are byte offsets from the start of the header,
+   so the address is computed with integer arithmetic: adding them to
+   the typed pointer directly would scale the offset by
+   sizeof(fd_dbl_buf_t) and point far outside the object. */
+
+FD_FN_PURE static inline void *
+fd_dbl_buf_slot( fd_dbl_buf_t * buf,
+                 ulong          seq ) {
+  ulong off = (seq&1UL) ? buf->buf1 : buf->buf0;
+  return (void *)( (ulong)buf + off );
+}
+
+/* fd_dbl_buf_insert appends a message to the double buffer.
+
+ Note: It is NOT safe to call this function from multiple threads. */
+
+void
+fd_dbl_buf_insert( fd_dbl_buf_t * buf,
+ void const * msg,
+ ulong sz );
+
+/* fd_dbl_buf_try_read does a speculative read of the most recent message
+ (from the caller's POV). The read may be overrun by a writer. out
+ points to a buffer of fd_dbl_buf_obj_mtu(buf) bytes. opt_seqp points to
+ a ulong or NULL.
+
+ On success:
+ - returns the size of the message read
+ - a copy of the message is stored at out
+ - *opt_seqp is set to the msg sequence number (if non-NULL)
+
+ On failure (due to overrun):
+ - returns ULONG_MAX
+ - out buffer is clobbered
+ - *opt_seqp is clobbered (if non-NULL) */
+
+static inline ulong
+fd_dbl_buf_try_read( fd_dbl_buf_t * buf,
+ void * out,
+ ulong * opt_seqp ) {
+ ulong seq = fd_dbl_buf_seq_query( buf );
+ void * src = fd_dbl_buf_slot( buf, seq );
+ ulong sz = FD_VOLATILE_CONST( buf->sz );
+ fd_memcpy( out, src, sz );
+ /* A changed sequence number means a writer published during the copy */
+ if( FD_UNLIKELY( seq!=fd_dbl_buf_seq_query( buf ) ) ) return ULONG_MAX;
+ fd_ulong_store_if( !!opt_seqp, opt_seqp, seq );
+ return sz;
+}
+
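+/* fd_dbl_buf_read does a blocking read: it retries fd_dbl_buf_try_read
+   until an attempt completes without being overrun.  Arguments and
+   return value are as for fd_dbl_buf_try_read, except that ULONG_MAX
+   is never returned. */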
+
+ulong
+fd_dbl_buf_read( fd_dbl_buf_t * buf,
+ void * obj,
+ ulong * opt_seqp );
+
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_waltz_mib_fd_dbl_buf_h */
diff --git a/src/waltz/mib/fd_netdev_netlink.c b/src/waltz/mib/fd_netdev_netlink.c
new file mode 100644
index 0000000000..99ce900d13
--- /dev/null
+++ b/src/waltz/mib/fd_netdev_netlink.c
@@ -0,0 +1,221 @@
+#include "fd_netdev_netlink.h"
+#include "../../util/fd_util.h"
+#include "fd_netdev_tbl.h"
+
+#if !defined(__linux__)
+#error "fd_fib4_netlink.c requires a Linux system with kernel headers"
+#endif
+
+#include
+#include /* IFNAMSIZ */
+#include /* ARPHRD_NETROM */
+#include /* RTM_{...}, NLM_{...} */
+
+/* fd_netdev_init resets netdev to default values: MTU 1500, interface
+   index 0, no bond slave table, no master, and an invalid oper status.
+   Fields not listed in the initializer are zeroed.  Returns netdev. */
+
+static fd_netdev_t *
+fd_netdev_init( fd_netdev_t * netdev ) {
+ *netdev = (fd_netdev_t) {
+ .mtu = 1500,
+ .if_idx = 0,
+ .slave_tbl_idx = -1,
+ .master_idx = -1,
+ .oper_status = FD_OPER_STATUS_INVALID
+ };
+ return netdev;
+}
+
+/* ifoper_to_oper_status translates a Linux IF_OPER_* value into the
+   corresponding FD_OPER_STATUS_* value.  Unrecognized inputs map to
+   FD_OPER_STATUS_INVALID. */
+
+FD_FN_CONST static uchar
+ifoper_to_oper_status( uint if_oper ) {
+ /* Linux uses different enum values than RFC 2863 */
+ switch( if_oper ) {
+ case IF_OPER_UNKNOWN:
+ return FD_OPER_STATUS_UNKNOWN;
+ case IF_OPER_NOTPRESENT:
+ return FD_OPER_STATUS_NOT_PRESENT;
+ case IF_OPER_DOWN:
+ return FD_OPER_STATUS_DOWN;
+ case IF_OPER_LOWERLAYERDOWN:
+ return FD_OPER_STATUS_LOWER_LAYER_DOWN;
+ case IF_OPER_TESTING:
+ return FD_OPER_STATUS_TESTING;
+ case IF_OPER_DORMANT:
+ return FD_OPER_STATUS_DORMANT;
+ case IF_OPER_UP:
+ return FD_OPER_STATUS_UP;
+ default:
+ return FD_OPER_STATUS_INVALID;
+ }
+}
+
+/* fd_netdev_netlink_load_table replaces the contents of tbl with the
+   network interfaces reported by an RTM_GETLINK dump on netlink.
+
+   tbl is reset first.  Only Ethernet and loopback interfaces are
+   recorded; each is stored at the table index equal to its kernel
+   interface index.  Loopback devices are always marked UP.  After the
+   dump, every UP interface that has a master is appended to the bond
+   slave table of that master (bonds or slaves beyond the configured
+   limits are logged and skipped).
+
+   Consumes one sequence number from netlink.
+
+   Returns 0 on success.  On failure, logs a warning and returns a
+   positive errno-style code: the errno of a failed send, EPIPE on a
+   short write, the error reported by the kernel, the error recorded by
+   the netlink iterator, ENOSPC if an interface index does not fit in
+   the table, or EPROTO on a malformed message.  Where a dump is
+   abandoned midway, the remaining messages are drained from the
+   socket so that they are not mistaken for replies to a later
+   request. */
+
+int
+fd_netdev_netlink_load_table( fd_netdev_tbl_join_t * tbl,
+                              fd_netlink_t *         netlink ) {
+
+  fd_netdev_tbl_reset( tbl );
+
+  uint seq = netlink->seq++;
+
+  struct {
+    struct nlmsghdr  nlh;
+    struct ifinfomsg ifi;
+  } request;
+  request.nlh = (struct nlmsghdr) {
+    .nlmsg_type  = RTM_GETLINK,
+    .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+    .nlmsg_len   = sizeof(request),
+    .nlmsg_seq   = seq
+  };
+  request.ifi = (struct ifinfomsg) {
+    .ifi_family = AF_PACKET,
+    .ifi_type   = ARPHRD_NETROM,
+  };
+
+  long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 );
+  if( FD_UNLIKELY( send_res<0 ) ) {
+    int send_errno = errno; /* snapshot before logging can clobber it */
+    FD_LOG_WARNING(( "netlink send(%d,RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP) failed (%i-%s)", netlink->fd, send_errno, fd_io_strerror( send_errno ) ));
+    return send_errno;
+  }
+  if( FD_UNLIKELY( send_res!=sizeof(request) ) ) {
+    FD_LOG_WARNING(( "netlink send(%d,RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)", netlink->fd ));
+    return EPIPE;
+  }
+
+  int err = 0;
+
+  uchar buf[ 4096 ];
+  fd_netlink_iter_t iter[1];
+  for( fd_netlink_iter_init( iter, netlink, buf, sizeof(buf) );
+       !fd_netlink_iter_done( iter );
+       fd_netlink_iter_next( iter, netlink ) ) {
+    struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter );
+    if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) {
+      struct nlmsgerr const * nl_msg = NLMSG_DATA( nlh );
+      int nl_err = -nl_msg->error;
+      FD_LOG_WARNING(( "netlink RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) ));
+      return nl_err;
+    }
+    if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWLINK ) ) {
+      FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type ));
+      continue;
+    }
+    if( FD_UNLIKELY( nlh->nlmsg_len < NLMSG_LENGTH( sizeof(struct ifinfomsg) ) ) ) {
+      /* Too short to hold an ifinfomsg; do not read past the message */
+      FD_LOG_WARNING(( "Error reading interface table: RTM_NEWLINK message too short (%u bytes)", nlh->nlmsg_len ));
+      err = EPROTO;
+      goto fail;
+    }
+    struct ifinfomsg const * ifi = NLMSG_DATA( nlh );
+
+    if( FD_UNLIKELY( ifi->ifi_index<0 || ifi->ifi_index>=tbl->hdr->dev_max ) ) {
+      FD_LOG_WARNING(( "Error reading interface table: interface %d is beyond max of %u", ifi->ifi_index, tbl->hdr->dev_max ));
+      err = ENOSPC;
+      goto fail; /* report the error and drain the rest of the dump */
+    }
+    if( ifi->ifi_type!=ARPHRD_ETHER && ifi->ifi_type!=ARPHRD_LOOPBACK ) continue;
+
+    /* Attributes start after the (aligned) ifinfomsg.  nlmsg_len also
+       counts the netlink header, so the attribute region is nlmsg_len
+       minus header and ifinfomsg; subtracting only the ifinfomsg would
+       let the walk below run past the end of this message. */
+    struct rtattr * rat    = (void *)( (ulong)ifi + NLMSG_ALIGN( sizeof(struct ifinfomsg) ) );
+    long            rat_sz = (long)nlh->nlmsg_len - (long)NLMSG_SPACE( sizeof(struct ifinfomsg) );
+
+    fd_netdev_t netdev[1];
+    fd_netdev_init( netdev );
+
+    for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) {
+      void * rta    = RTA_DATA( rat );
+      ulong  rta_sz = RTA_PAYLOAD( rat );
+
+      switch( rat->rta_type ) {
+
+      case IFLA_IFNAME:
+        if( FD_UNLIKELY( rta_sz==0 || rta_sz>=IFNAMSIZ ) ) {
+          FD_LOG_WARNING(( "Error reading interface table: IFLA_IFNAME has unsupported size %lu", rta_sz ));
+          err = EPROTO;
+          goto fail;
+        }
+        memcpy( netdev->name, rta, rta_sz );
+        netdev->name[ rta_sz ] = '\0';
+        break;
+
+      case IFLA_ADDRESS:
+        /* Hardware addresses that are not 6 bytes are ignored */
+        if( FD_LIKELY( rta_sz==6UL ) ) {
+          memcpy( netdev->mac_addr, rta, 6 );
+        }
+        break;
+
+      case IFLA_OPERSTATE:
+        if( FD_UNLIKELY( rta_sz!=1UL ) ) {
+          FD_LOG_WARNING(( "Error reading interface table: IFLA_OPERSTATE has unexpected size %lu", rta_sz ));
+          err = EPROTO;
+          goto fail;
+        }
+        netdev->oper_status = (uchar)ifoper_to_oper_status( FD_LOAD( uchar, rta ) );
+        break;
+
+      case IFLA_MTU:
+        if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+          FD_LOG_WARNING(( "Error reading interface table: IFLA_MTU has unexpected size %lu", rta_sz ));
+          err = EPROTO;
+          goto fail;
+        }
+        /* MTU is stored as ushort, clamp larger values */
+        netdev->mtu = (ushort)fd_uint_min( FD_LOAD( uint, rta ), USHORT_MAX );
+        break;
+
+      case IFLA_MASTER: {
+        if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+          FD_LOG_WARNING(( "Error reading interface table: IFLA_MASTER has unexpected size %lu", rta_sz ));
+          err = EPROTO;
+          goto fail;
+        }
+        int master_idx = FD_LOAD( int, rta );
+        if( FD_UNLIKELY( master_idx<0 || master_idx>=tbl->hdr->dev_max ) ) {
+          FD_LOG_WARNING(( "Error reading interface table: IFLA_MASTER has invalid index %d", master_idx ));
+          err = EPROTO;
+          goto fail;
+        }
+        netdev->master_idx = (short)master_idx;
+        break;
+      }
+
+      default:
+        break; /* attribute not of interest */
+
+      } /* switch( rat->rta_type ) */
+    } /* for each RTA */
+
+    if( ifi->ifi_type==ARPHRD_LOOPBACK ) {
+      netdev->oper_status = FD_OPER_STATUS_UP;
+    }
+
+    tbl->dev_tbl[ ifi->ifi_index ] = *netdev;
+    tbl->hdr->dev_cnt = (ushort)fd_uint_max( tbl->hdr->dev_cnt, (uint)ifi->ifi_index+1U );
+  }
+
+  /* The iterator records receive failures and malformed messages as a
+     positive error code (a negative value only marks the end of the
+     reply).  Do not report a truncated table as success. */
+  if( FD_UNLIKELY( iter->err>0 ) ) return iter->err;
+
+  /* Walk the table again to index the bond master => slave mapping */
+
+  for( ulong j=0UL; j<(tbl->hdr->dev_cnt); j++ ) {
+    /* Only consider UP slaves */
+    if( tbl->dev_tbl[ j ].oper_status!=FD_OPER_STATUS_UP ) continue;
+
+    /* Find master */
+    int master_idx = tbl->dev_tbl[ j ].master_idx;
+    if( master_idx<0 ) continue;
+    if( FD_UNLIKELY( master_idx>=tbl->hdr->dev_max ) ) continue; /* unreachable */
+    fd_netdev_t * master = &tbl->dev_tbl[ master_idx ];
+
+    /* Allocate a new bond slave table if needed */
+    if( master->slave_tbl_idx<0 ) {
+      if( FD_UNLIKELY( tbl->hdr->bond_cnt>=tbl->hdr->bond_max ) ) {
+        FD_LOG_WARNING(( "Error reading interface table: Found %u bond devices but max is %u", tbl->hdr->bond_cnt, tbl->hdr->bond_max ));
+        continue;
+      }
+
+      master->slave_tbl_idx = (short)tbl->hdr->bond_cnt;
+      tbl->hdr->bond_cnt    = (ushort)( tbl->hdr->bond_cnt+1U );
+      /* Assume that this table is empty */
+    }
+
+    fd_netdev_bond_t * bond = &tbl->bond_tbl[ master->slave_tbl_idx ];
+    if( FD_UNLIKELY( bond->slave_cnt>=FD_NETDEV_BOND_SLAVE_MAX ) ) {
+      FD_LOG_WARNING(( "Error reading interface table: Bond device %d has %u slaves but max is %d", master_idx, bond->slave_cnt, FD_NETDEV_BOND_SLAVE_MAX ));
+      continue;
+    }
+    bond->slave_idx[ bond->slave_cnt ] = (ushort)j;
+    bond->slave_cnt = (uchar)( bond->slave_cnt+1U );
+  }
+
+  return 0;
+
+fail:
+  /* Discard the rest of the dump so it does not pollute later reads */
+  fd_netlink_iter_drain( iter, netlink );
+  return err;
+}
diff --git a/src/waltz/mib/fd_netdev_netlink.h b/src/waltz/mib/fd_netdev_netlink.h
new file mode 100644
index 0000000000..67b0f61759
--- /dev/null
+++ b/src/waltz/mib/fd_netdev_netlink.h
@@ -0,0 +1,17 @@
+/* fd_netdev_netlink.h provides APIs for importing network interfaces from
+ Linux netlink. */
+
+#if defined(__linux__)
+
+#include "fd_netdev_tbl.h"
+#include "../ip/fd_netlink1.h"
+
+FD_PROTOTYPES_BEGIN
+
+int /* 0 on success, errno-style code (e.g. EPROTO) on failure */
+fd_netdev_netlink_load_table( fd_netdev_tbl_join_t * tbl,
+ fd_netlink_t * netlink );
+
+FD_PROTOTYPES_END
+
+#endif /* defined(__linux__) */
diff --git a/src/waltz/mib/fd_netdev_tbl.c b/src/waltz/mib/fd_netdev_tbl.c
new file mode 100644
index 0000000000..f8619c4ecf
--- /dev/null
+++ b/src/waltz/mib/fd_netdev_tbl.c
@@ -0,0 +1,192 @@
+#include "fd_netdev_tbl.h"
+#include "../../util/fd_util.h"
+
+struct fd_netdev_tbl_private {
+ ulong magic; /* ==FD_NETDEV_TBL_MAGIC while formatted, zeroed by fd_netdev_tbl_delete */
+ ulong dev_off; /* byte offset from this struct to the fd_netdev_t array */
+ ulong bond_off; /* byte offset from this struct to the fd_netdev_bond_t array */
+ fd_netdev_tbl_hdr_t hdr; /* capacities and current counts */
+};
+
+FD_FN_CONST ulong
+fd_netdev_tbl_align( void ) {
+ return FD_NETDEV_TBL_ALIGN; /* compile-time constant from fd_netdev_tbl.h */
+}
+
+ulong
+fd_netdev_tbl_footprint( ulong dev_max,
+                         ulong bond_max ) {
+  /* Both capacities must be non-zero and fit in a ushort, else the footprint is 0 */
+  if( FD_UNLIKELY( !dev_max || dev_max>USHORT_MAX || !bond_max || bond_max>USHORT_MAX ) ) return 0UL;
+  ulong l = FD_LAYOUT_INIT;
+  l = FD_LAYOUT_APPEND( l, alignof(fd_netdev_tbl_t),  sizeof(fd_netdev_tbl_t)             );
+  l = FD_LAYOUT_APPEND( l, alignof(fd_netdev_t),      sizeof(fd_netdev_t)      * dev_max  );
+  l = FD_LAYOUT_APPEND( l, alignof(fd_netdev_bond_t), sizeof(fd_netdev_bond_t) * bond_max );
+  return FD_LAYOUT_FINI( l, FD_NETDEV_TBL_ALIGN );
+}
+
+void *
+fd_netdev_tbl_new( void * shmem,
+ ulong dev_max,
+ ulong bond_max ) {
+ /* Reject NULL or misaligned memory and out-of-range capacities before writing anything */
+ if( FD_UNLIKELY( !shmem ) ) {
+ FD_LOG_WARNING(( "NULL shmem" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, FD_NETDEV_TBL_ALIGN ) ) ) {
+ FD_LOG_WARNING(( "misaligned shmem" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !dev_max || dev_max>USHORT_MAX ) ) { /* dev_max is stored in a ushort header field */
+ FD_LOG_WARNING(( "invalid dev_max" ));
+ return NULL;
+ }
+
+ if( FD_UNLIKELY( !bond_max || bond_max>USHORT_MAX ) ) { /* bond_max is stored in a ushort header field */
+ FD_LOG_WARNING(( "invalid bond_max" ));
+ return NULL;
+ }
+
+ /* Carve out header, device array and bond array, in the same order as fd_netdev_tbl_footprint */
+ FD_SCRATCH_ALLOC_INIT( l, shmem );
+ fd_netdev_tbl_t * tbl = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_tbl_t), sizeof(fd_netdev_tbl_t) );
+ fd_netdev_t * dev = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_t), sizeof(fd_netdev_t) * dev_max );
+ fd_netdev_bond_t * bond = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_netdev_bond_t), sizeof(fd_netdev_bond_t) * bond_max );
+ FD_SCRATCH_ALLOC_FINI( l, FD_NETDEV_TBL_ALIGN );
+
+ *tbl = (fd_netdev_tbl_t) {
+ .magic = FD_NETDEV_TBL_MAGIC,
+ .dev_off = (ulong)dev - (ulong)tbl, /* arrays are located via offsets relative to the header */
+ .bond_off = (ulong)bond - (ulong)tbl,
+ .hdr = {
+ .dev_max = (ushort)dev_max,
+ .bond_max = (ushort)bond_max,
+ .dev_cnt = 0,
+ .bond_cnt = 0,
+ }
+ };
+ /* Join temporarily so fd_netdev_tbl_reset can put every device and bond slot into its empty state */
+ fd_netdev_tbl_join_t join[1];
+ fd_netdev_tbl_join( join, shmem );
+ fd_netdev_tbl_reset( join );
+ fd_netdev_tbl_leave( join );
+
+ return tbl; /* tbl is the first region carved out of shmem */
+}
+
+fd_netdev_tbl_join_t *
+fd_netdev_tbl_join( void * ljoin,
+                    void * shtbl ) {
+  if( FD_UNLIKELY( !ljoin ) ) { /* ljoin is written through below, so reject NULL like shtbl */
+    FD_LOG_WARNING(( "NULL ljoin" ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( !shtbl ) ) {
+    FD_LOG_WARNING(( "NULL shtbl" ));
+    return NULL;
+  }
+  fd_netdev_tbl_join_t * join = ljoin;
+  fd_netdev_tbl_t *      tbl  = shtbl;
+  if( FD_UNLIKELY( tbl->magic!=FD_NETDEV_TBL_MAGIC ) ) { /* region was never formatted or was deleted */
+    FD_LOG_WARNING(( "bad magic" ));
+    return NULL;
+  }
+  /* Turn the header-relative offsets stored in the table into pointers valid in this address space */
+  *join = (fd_netdev_tbl_join_t) {
+    .hdr      = &tbl->hdr,
+    .dev_tbl  = (fd_netdev_t      *)( (ulong)tbl + tbl->dev_off  ),
+    .bond_tbl = (fd_netdev_bond_t *)( (ulong)tbl + tbl->bond_off ),
+  };
+  return join;
+}
+
+void *
+fd_netdev_tbl_leave( fd_netdev_tbl_join_t * join ) {
+ return join; /* nothing to release: hands the local join struct (not shtbl) back to the caller */
+}
+
+void *
+fd_netdev_tbl_delete( void * shtbl ) {
+  /* Clearing the magic unformats the region, so later joins fail with
+     "bad magic".  The rest of the region is left untouched. */
+  fd_netdev_tbl_t * hdr = shtbl;
+  if( FD_UNLIKELY( hdr==NULL ) ) {
+    FD_LOG_WARNING(( "NULL shtbl" ));
+    return NULL;
+  }
+  hdr->magic = 0UL;
+  return shtbl;
+}
+
+void
+fd_netdev_tbl_reset( fd_netdev_tbl_join_t * tbl ) {
+  /* An empty device slot is all zeros except that it is neither a bond master nor a bond slave */
+  fd_netdev_t const empty_dev = { .master_idx = -1, .slave_tbl_idx = -1 };
+  ulong const dev_max  = tbl->hdr->dev_max;
+  ulong const bond_max = tbl->hdr->bond_max;
+
+  tbl->hdr->dev_cnt  = 0;
+  tbl->hdr->bond_cnt = 0;
+  for( ulong i=0UL; i<dev_max; i++ ) tbl->dev_tbl[i] = empty_dev;
+  fd_memset( tbl->bond_tbl, 0, sizeof(fd_netdev_bond_t) * bond_max );
+}
+
+#if FD_HAS_HOSTED
+
+#include
+#include
+#include "../../util/net/fd_eth.h"
+
+#define WRAP_PRINT(file,str) if( FD_UNLIKELY( fputs( (str), (file) )<0 ) ) return errno
+#define WRAP_PRINTF(file,...) if( FD_UNLIKELY( fprintf( (file), __VA_ARGS__ )<0 ) ) return errno
+/* WRAP_PRINT{,F} write to file and make the enclosing function return errno if the write fails */
+int
+fd_netdev_tbl_fprintf( fd_netdev_tbl_join_t const * tbl,
+ void * file_ ) {
+ FILE * file = file_; /* passed as void * so the prototype does not need <stdio.h> */
+ for( ulong j=0UL; j<(tbl->hdr->dev_cnt); j++ ) {
+ fd_netdev_t const * dev = &tbl->dev_tbl[j];
+ if( !dev->oper_status ) continue; /* skip slots whose status is FD_OPER_STATUS_INVALID (0) */
+ WRAP_PRINTF( file,
+ "%lu: %s: mtu %u state (%i-%s)",
+ j, dev->name, dev->mtu,
+ dev->oper_status, fd_oper_status_cstr( dev->oper_status ) );
+ if( dev->slave_tbl_idx>=0 ) { /* device owns a bond slave table, i.e. it is a bond master */
+ WRAP_PRINT( file, " master" );
+ }
+ WRAP_PRINTF( file,
+ "\n link " FD_ETH_MAC_FMT "\n",
+ FD_ETH_MAC_FMT_ARGS( dev->mac_addr ) );
+ if( dev->slave_tbl_idx>=0 && tbl->bond_tbl[ dev->slave_tbl_idx ].slave_cnt ) { /* only list non-empty bonds */
+ fd_netdev_bond_t * bond = &tbl->bond_tbl[ dev->slave_tbl_idx ];
+ WRAP_PRINTF( file, " slaves (%u):", bond->slave_cnt );
+ for( ulong k=0UL; k<(bond->slave_cnt); k++ ) {
+ WRAP_PRINTF( file, " %u-%s", bond->slave_idx[k], tbl->dev_tbl[ bond->slave_idx[k] ].name ); /* index-name of each slave */
+ }
+ WRAP_PRINT( file, "\n" );
+ }
+ }
+ return 0;
+}
+
+#undef WRAP_PRINT
+#undef WRAP_PRINTF
+
+#endif /* FD_HAS_HOSTED */
+
+char const *
+fd_oper_status_cstr( uint oper_status ) {
+  /* Codes without an entry (incl. FD_OPER_STATUS_INVALID and FD_OPER_STATUS_UNKNOWN) map to "unknown" */
+  static char const * const name_tbl[] = {
+    [ FD_OPER_STATUS_UP               ] = "up",
+    [ FD_OPER_STATUS_DOWN             ] = "down",
+    [ FD_OPER_STATUS_TESTING          ] = "testing",
+    [ FD_OPER_STATUS_DORMANT          ] = "dormant",
+    [ FD_OPER_STATUS_NOT_PRESENT      ] = "not present",
+    [ FD_OPER_STATUS_LOWER_LAYER_DOWN ] = "lower layer down",
+  };
+  char const * name = oper_status<( sizeof(name_tbl)/sizeof(name_tbl[0]) ) ? name_tbl[ oper_status ] : NULL;
+  return name ? name : "unknown";
+}
diff --git a/src/waltz/mib/fd_netdev_tbl.h b/src/waltz/mib/fd_netdev_tbl.h
new file mode 100644
index 0000000000..6b36f4ef42
--- /dev/null
+++ b/src/waltz/mib/fd_netdev_tbl.h
@@ -0,0 +1,147 @@
+#ifndef HEADER_fd_src_waltz_mib_fd_netdev_h
+#define HEADER_fd_src_waltz_mib_fd_netdev_h
+
+/* fd_netdev_tbl.h provides a network interface table.
+ The entrypoint of this API is fd_netdev_tbl_t. */
+
+#include "../../util/fd_util_base.h"
+
+/* FD_OPER_STATUS_* give the operational state of a network interface.
+ See RFC 2863 Section 3.1.14: https://datatracker.ietf.org/doc/html/rfc2863#section-3.1.14 */
+
+#define FD_OPER_STATUS_INVALID (0)
+#define FD_OPER_STATUS_UP (1) /* ready to pass packets */
+#define FD_OPER_STATUS_DOWN (2)
+#define FD_OPER_STATUS_TESTING (3) /* in some test mode */
+#define FD_OPER_STATUS_UNKNOWN (4) /* status can not be determined */
+#define FD_OPER_STATUS_DORMANT (5)
+#define FD_OPER_STATUS_NOT_PRESENT (6) /* some component is missing */
+#define FD_OPER_STATUS_LOWER_LAYER_DOWN (7) /* down due to state of lower-layer interface(s) */
+
+/* fd_netdev_t holds basic configuration of a network device. */
+
+struct fd_netdev {
+ ushort mtu; /* Largest layer-3 payload that fits in a packet */
+ uchar mac_addr[6]; /* MAC address */
+ ushort if_idx; /* Interface index */
+ short slave_tbl_idx; /* index to bond slave table, -1 if not a bond master */
+ short master_idx; /* index of bond master, -1 if not a bond slave */
+ char name[16]; /* cstr interface name (max 15 length) */
+ uchar oper_status; /* one of FD_OPER_STATUS_{...} */
+ uchar pad[1];
+ /* padded to 32 bytes */
+};
+
+typedef struct fd_netdev fd_netdev_t;
+
+/* FD_NETDEV_BOND_SLAVE_MAX is the max supported number of bond slaves. */
+
+#define FD_NETDEV_BOND_SLAVE_MAX (16)
+
+/* fd_netdev_bond_t lists active slaves of a bond device. */
+
+struct fd_netdev_bond {
+ uchar slave_cnt;
+ ushort slave_idx[ FD_NETDEV_BOND_SLAVE_MAX ];
+};
+
+typedef struct fd_netdev_bond fd_netdev_bond_t;
+
+/* fd_netdev_tbl_t provides an interface table.
+
+ This table is optimized for frequent reads and rare writes. It is
+ generally not thread-safe to modify the table in-place. The only safe
+ way to sync modifications to other threads is by copying the table in
+ its entirety. */
+
+struct fd_netdev_tbl_private;
+typedef struct fd_netdev_tbl_private fd_netdev_tbl_t;
+
+struct fd_netdev_tbl_hdr {
+ ushort dev_max; /* capacity of the device table */
+ ushort bond_max; /* capacity of the bond slave table */
+ ushort dev_cnt; /* number of leading device slots in use (loader sets it to highest interface index + 1) */
+ ushort bond_cnt; /* number of bond slave tables handed out so far */
+};
+typedef struct fd_netdev_tbl_hdr fd_netdev_tbl_hdr_t;
+
+struct fd_netdev_tbl_join {
+ fd_netdev_tbl_hdr_t * hdr; /* points at the header inside the table memory */
+ fd_netdev_t * dev_tbl; /* device array, hdr->dev_max entries */
+ fd_netdev_bond_t * bond_tbl; /* bond slave array, hdr->bond_max entries */
+};
+typedef struct fd_netdev_tbl_join fd_netdev_tbl_join_t;
+
+#define FD_NETDEV_TBL_MAGIC (0xd5f9ba2710d6bf0aUL) /* random */
+
+/* FD_NETDEV_TBL_ALIGN is the return value of fd_netdev_tbl_align() */
+
+#define FD_NETDEV_TBL_ALIGN (16UL)
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_netdev_tbl_{align,footprint} describe a memory region suitable to
+ back a netdev_tbl with dev_max interfaces and bond_max bond masters. */
+
+FD_FN_CONST ulong
+fd_netdev_tbl_align( void );
+
+ulong
+fd_netdev_tbl_footprint( ulong dev_max,
+ ulong bond_max );
+
+/* fd_netdev_tbl_new formats a memory region as an empty netdev_tbl.
+ Returns shmem on success. On failure returns NULL and logs reason for
+ failure. */
+
+void *
+fd_netdev_tbl_new( void * shmem,
+ ulong dev_max,
+ ulong bond_max );
+
+/* fd_netdev_tbl_join joins a netdev_tbl at shtbl. ljoin points to a
+ fd_netdev_tbl_join_t[1] to which object information is written to.
+ Returns ljoin on success. On failure, returns NULL and logs reason for
+ failure. */
+
+fd_netdev_tbl_join_t *
+fd_netdev_tbl_join( void * ljoin,
+ void * shtbl );
+
+/* fd_netdev_tbl_leave undoes a fd_netdev_tbl_join. Returns ownership
+ of the region backing join to the caller. (Warning: This returns ljoin,
+ not shtbl) */
+
+void *
+fd_netdev_tbl_leave( fd_netdev_tbl_join_t * join );
+
+/* fd_netdev_tbl_delete unformats the memory region backing a netdev_tbl
+ and returns ownership of the region back to the caller. */
+
+void *
+fd_netdev_tbl_delete( void * shtbl );
+
+/* fd_netdev_tbl_reset resets the table to the state of a newly constructed
+ empty object (clears all devices and bonds). */
+
+void
+fd_netdev_tbl_reset( fd_netdev_tbl_join_t * tbl );
+
+#if FD_HAS_HOSTED
+
+/* fd_netdev_tbl_fprintf prints the interface table to the given FILE *
+ pointer (or target equivalent). Outputs ASCII encoding with LF
+ newlines. Returns errno on failure and 0 on success. */
+
+int
+fd_netdev_tbl_fprintf( fd_netdev_tbl_join_t const * tbl,
+ void * file );
+
+#endif /* FD_HAS_HOSTED */
+
+/* fd_oper_status_cstr returns a static cstr describing an FD_OPER_STATUS_{...} code. */
+char const *
+fd_oper_status_cstr( uint oper_status );
+FD_PROTOTYPES_END
+
+#endif /* HEADER_fd_src_waltz_mib_fd_netdev_h */
diff --git a/src/waltz/mib/test_netdev_netlink.c b/src/waltz/mib/test_netdev_netlink.c
new file mode 100644
index 0000000000..e25363592b
--- /dev/null
+++ b/src/waltz/mib/test_netdev_netlink.c
@@ -0,0 +1,61 @@
+#include
+#include "fd_netdev_netlink.h"
+#include "../../util/fd_util.h"
+
+int
+main( int     argc,
+      char ** argv ) {
+  fd_boot( &argc, &argv );
+  /* Derive the default NUMA node from this tile's CPU; use CPU 0 if the index is not a valid CPU */
+  ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() );
+  if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL;
+
+  char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz",  NULL, "normal"                     );
+  ulong        page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 4096UL                       );
+  ulong        numa_idx = fd_env_strip_cmdline_ulong( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx( cpu_idx ) );
+  ulong        dev_cnt  = fd_env_strip_cmdline_ulong( &argc, &argv, "--dev-cnt",  NULL, 256UL                        );
+  ulong        bond_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--bond-cnt", NULL, 4UL                          );
+
+  ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz );
+  if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" ));
+
+  if( FD_UNLIKELY( !dev_cnt  ) ) FD_LOG_ERR(( "unsupported --dev-cnt"  ));
+  if( FD_UNLIKELY( !bond_cnt ) ) FD_LOG_ERR(( "unsupported --bond-cnt" ));
+
+  FD_LOG_NOTICE(( "Creating workspace (--page-cnt %lu, --page-sz %s, --numa-idx %lu)", page_cnt, _page_sz, numa_idx ));
+  fd_wksp_t * wksp = fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL );
+  FD_TEST( wksp );
+  /* A zero footprint means one of the two capacities is out of range */
+  ulong tbl_fp = fd_netdev_tbl_footprint( dev_cnt, bond_cnt );
+  if( FD_UNLIKELY( !tbl_fp ) ) {
+    FD_LOG_ERR(( "Invalid --dev-cnt or --bond-cnt" ));
+  }
+  void * tbl_mem = fd_wksp_alloc_laddr( wksp, fd_netdev_tbl_align(), tbl_fp, 1UL );
+  FD_TEST( tbl_mem );
+
+  FD_TEST( fd_netdev_tbl_new( tbl_mem, dev_cnt, bond_cnt )==tbl_mem );
+  fd_netdev_tbl_join_t tbl[1];
+  FD_TEST( fd_netdev_tbl_join( tbl, tbl_mem )==tbl );
+
+  fd_netlink_t   _netlink[1];
+  fd_netlink_t * netlink = fd_netlink_init( _netlink, 42U );
+  FD_TEST( netlink );
+  /* A load failure is only reported; whatever was loaded is still dumped below */
+  int ld_err = fd_netdev_netlink_load_table( tbl, netlink );
+  if( FD_UNLIKELY( ld_err ) ) {
+    FD_LOG_WARNING(( "Failed to load interfaces (error code %i)", ld_err ));
+  }
+  FD_LOG_NOTICE(( "Dumping interface table" ));
+  fd_log_flush();
+  fd_netdev_tbl_fprintf( tbl, stderr );
+  fflush( stderr );
+  /* Tear down in reverse order of construction */
+  fd_netlink_fini( netlink );
+  fd_netdev_tbl_leave( tbl );
+  fd_wksp_free_laddr( fd_netdev_tbl_delete( tbl_mem ) );
+  fd_wksp_delete_anonymous( wksp );
+
+  FD_LOG_NOTICE(( "pass" ));
+  fd_halt();
+  return 0;
+}
diff --git a/src/waltz/neigh/Local.mk b/src/waltz/neigh/Local.mk
new file mode 100644
index 0000000000..0cba1dca5f
--- /dev/null
+++ b/src/waltz/neigh/Local.mk
@@ -0,0 +1,9 @@
+$(call add-hdrs,fd_neigh4_map.h fd_neigh4_map_defines.h)
+$(call add-objs,fd_neigh4_map,fd_waltz)
+ifdef FD_HAS_LINUX
+ifdef FD_HAS_SSE
+$(call add-hdrs,fd_neigh4_netlink.h)
+$(call add-objs,fd_neigh4_netlink,fd_waltz)
+$(call make-unit-test,test_neigh4_netlink,test_neigh4_netlink,fd_waltz fd_util)
+endif
+endif
diff --git a/src/waltz/neigh/fd_neigh4_map.c b/src/waltz/neigh/fd_neigh4_map.c
new file mode 100644
index 0000000000..7631b1dfac
--- /dev/null
+++ b/src/waltz/neigh/fd_neigh4_map.c
@@ -0,0 +1,41 @@
+/* Include fd_neigh4_map prototypes */
+#include "fd_neigh4_map.h"
+
+/* Generate fd_neigh4_map definitions */
+#include "fd_neigh4_map_defines.h"
+#define MAP_IMPL_STYLE 2
+#include "../../util/tmpl/fd_map_slot_para.c"
+
+#if FD_HAS_HOSTED && FD_HAS_SSE
+
+#include
+#include
+#include
+#include "../../util/net/fd_ip4.h"
+#include "../../util/net/fd_eth.h"
+
+int
+fd_neigh4_hmap_fprintf( fd_neigh4_hmap_t const * map,
+ void * file_ ) {
+ FILE * file = file_;
+
+ ulong ele_max = fd_neigh4_hmap_ele_max( map );
+ fd_neigh4_entry_t const * ele = fd_neigh4_hmap_shele_const( map );
+
+ for( ulong j=0UL; j
+#include /* AF_INET */
+#include /* struct nlmsghdr */
+#include /* RTM_NEWNEIGH */
+#include /* struct ndmsg */
+#include "../ip/fd_netlink1.h"
+#include "../../util/fd_util.h"
+#include "../../util/net/fd_ip4.h"
+#include "fd_neigh4_map.h"
+
+int
+fd_neigh4_netlink_request_dump( fd_netlink_t * netlink,
+                                uint           if_idx ) {
+  /* Sends an RTM_GETNEIGH dump request for the IPv4 neighbors of if_idx.  Returns 0,
+     the errno of a failed send(2), or EPIPE on a short write.  Does not read the reply. */
+  uint seq = netlink->seq++;
+  struct {
+    struct nlmsghdr nlh;
+    struct ndmsg    ndm;
+  } request;
+  request.nlh = (struct nlmsghdr) {
+    .nlmsg_type  = RTM_GETNEIGH,
+    .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+    .nlmsg_len   = sizeof(request),
+    .nlmsg_seq   = seq
+  };
+  request.ndm = (struct ndmsg) {
+    .ndm_family  = AF_INET,
+    .ndm_ifindex = (int)if_idx
+  };
+
+  long send_res = send( netlink->fd, &request, sizeof(request), 0 );
+  if( FD_UNLIKELY( send_res<0 ) ) {
+    int err = errno; /* capture before logging, which may overwrite errno */
+    FD_LOG_WARNING(( "netlink send(RTM_GETNEIGH,NLM_F_REQUEST|NLM_F_DUMP) failed (%d-%s)", err, fd_io_strerror( err ) ));
+    return err;
+  }
+  if( FD_UNLIKELY( send_res!=sizeof(request) ) ) {
+    FD_LOG_WARNING(( "netlink send(RTM_GETNEIGH,NLM_F_REQUEST|NLM_F_DUMP) failed (short write)" ));
+    return EPIPE;
+  }
+  return 0;
+}
+
+void
+fd_neigh4_netlink_ingest_message( fd_neigh4_hmap_t *      map,
+                                  struct nlmsghdr const * msg_hdr,
+                                  uint                    if_idx ) {
+  /* Applies one RTM_NEWNEIGH or RTM_DELNEIGH message to map.  Messages
+     of another type, another address family, or another interface than
+     if_idx are ignored. */
+  if( FD_UNLIKELY( msg_hdr->nlmsg_type!=RTM_NEWNEIGH && msg_hdr->nlmsg_type!=RTM_DELNEIGH ) ) {
+    FD_LOG_WARNING(( "unexpected nlmsg_type %u", msg_hdr->nlmsg_type ));
+    return; /* payload is not a struct ndmsg, so do not parse it as one */
+  }
+
+  struct ndmsg const *  ndm    = NLMSG_DATA( msg_hdr );
+  struct rtattr const * rat    = RTM_RTA( ndm );
+  long                  rat_sz = (long)(int)RTM_PAYLOAD( msg_hdr );
+
+  if( FD_UNLIKELY( ndm->ndm_family!=AF_INET ) ) return;
+  if( FD_UNLIKELY( (uint)ndm->ndm_ifindex!=if_idx ) ) return;
+
+  uint ip4_dst = 0U;
+  union {
+    uchar u6[6];
+    ulong ul;
+  } mac_addr = { .ul = 0UL }; /* zero the whole ulong: ul is tested below but only u6 is ever written */
+
+  for( ; RTA_OK( rat, rat_sz ); rat=RTA_NEXT( rat, rat_sz ) ) {
+    void * rta    = RTA_DATA( rat );
+    ulong  rta_sz = RTA_PAYLOAD( rat );
+
+    switch( rat->rta_type ) {
+
+    case NDA_DST:
+      if( FD_UNLIKELY( rta_sz!=4UL ) ) {
+        FD_LOG_WARNING(( "unexpected NDA_DST size %lu", rta_sz ));
+        return;
+      }
+      ip4_dst = FD_LOAD( uint, rta ); /* big endian */
+      break;
+
+    case NDA_LLADDR:
+      if( FD_UNLIKELY( rta_sz!=6UL ) ) {
+        FD_LOG_WARNING(( "unexpected NDA_LLADDR size %lu (is this an Ethernet interface?)", rta_sz ));
+        return;
+      }
+      memcpy( mac_addr.u6, rta, 6 );
+      break;
+
+    default:
+      break; /* ignore */
+    }
+  }
+
+  /* NOTE(review): this check also discards removals that arrive without
+     NDA_LLADDR.  Confirm whether the kernel omits the link-layer address
+     for failed or deleted entries; if so, such entries are never removed. */
+  if( FD_UNLIKELY( !mac_addr.ul || !ip4_dst ) ) {
+    FD_LOG_DEBUG(( "Ignoring neighbor table update with missing or invalid L2 or L3 address" ));
+    return;
+  }
+
+  /* Determine if we should remove or insert/update entry */
+
+  int remove = 0;
+  switch( ndm->ndm_state ) {
+  case NUD_REACHABLE:
+  case NUD_STALE:
+  case NUD_DELAY:
+  case NUD_PROBE:
+  case NUD_PERMANENT:
+    remove = 0;
+    break;
+  default:
+    remove = 1;
+    break;
+  }
+  if( msg_hdr->nlmsg_type==RTM_DELNEIGH ) {
+    remove = 1;
+  }
+
+  /* Perform update */
+
+  if( remove ) {
+    fd_neigh4_hmap_remove( map, &ip4_dst, NULL, FD_MAP_FLAG_BLOCKING );
+  } else {
+    /* Insert a new entry or overwrite the existing one keyed by ip4_dst */
+    fd_neigh4_hmap_query_t query[1];
+    int prepare_res = fd_neigh4_hmap_prepare( map, &ip4_dst, NULL, query, FD_MAP_FLAG_BLOCKING );
+    if( FD_UNLIKELY( prepare_res!=FD_MAP_SUCCESS ) ) {
+      FD_LOG_WARNING(( "Failed to update neighbor table" ));
+      return;
+    }
+
+    fd_neigh4_entry_t * ele = fd_neigh4_hmap_query_ele( query );
+    ele->ip4_addr = ip4_dst;
+    memcpy( ele->mac_addr, mac_addr.u6, 6 );
+
+    fd_neigh4_hmap_publish( query );
+  }
+}
+
+int
+fd_neigh4_netlink_solicit( fd_netlink_t * netlink,
+                           uint           if_idx,
+                           uint           ip4_addr ) {
+  /* Asks the kernel to create a neighbor entry for ip4_addr (big endian)
+     on if_idx, then reads the ACK.  Returns 0 on success, else an
+     errno-style code.  Also returns 0 if no ACK shows up in 64 reads. */
+  uint seq = netlink->seq++;
+  struct {
+    struct nlmsghdr nlh;
+    struct ndmsg    ndm;
+    struct nlattr   nla_dst;
+    uint            dst_addr;
+  } request;
+  request.nlh = (struct nlmsghdr) {
+    .nlmsg_type  = RTM_NEWNEIGH,
+    .nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE,
+    .nlmsg_len   = sizeof(request), /* required: total message length including header, see netlink(7) */
+    .nlmsg_seq   = seq
+  };
+  request.ndm = (struct ndmsg) {
+    .ndm_family  = AF_INET,
+    .ndm_ifindex = (int)if_idx,
+    .ndm_state   = NUD_INCOMPLETE, /* neighbor entry starts out as empty */
+    .ndm_flags   = NTF_USE         /* mark neighbor as used which triggers ARP request */
+  };
+  request.nla_dst = (struct nlattr) {
+    .nla_type = NDA_DST,
+    .nla_len  = (ushort)( sizeof(struct nlattr) + fd_uint_align_up( sizeof(uint), NLA_ALIGNTO ) )
+  };
+  request.dst_addr = ip4_addr; /* big endian */
+
+  /* Send request */
+  long send_res = sendto( netlink->fd, &request, sizeof(request), 0, NULL, 0 );
+  if( FD_UNLIKELY( send_res<0 ) ) {
+    int send_errno = errno; /* capture before logging, which may overwrite errno */
+    FD_LOG_WARNING(( "netlink send(RTM_NEWNEIGH,NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE," FD_IP4_ADDR_FMT ") failed (%d-%s)",
+                     FD_IP4_ADDR_FMT_ARGS( ip4_addr ), send_errno, fd_io_strerror( send_errno ) ));
+    return send_errno;
+  }
+  if( FD_UNLIKELY( send_res!=sizeof(request) ) ) {
+    FD_LOG_WARNING(( "netlink send(RTM_NEWNEIGH,NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE," FD_IP4_ADDR_FMT ") failed (short write)",
+                     FD_IP4_ADDR_FMT_ARGS( ip4_addr ) ));
+    return EPIPE;
+  }
+
+  /* Get error code */
+  for( ulong attempt=0UL; attempt<64UL; attempt++ ) {
+    uchar buf[ 4096 ];
+    long recv_res = fd_netlink_read_socket( netlink->fd, buf, sizeof(buf) );
+    if( FD_UNLIKELY( recv_res<0 ) ) {
+      int recv_errno = errno; /* capture before logging, which may overwrite errno */
+      FD_LOG_WARNING(( "netlink recv failed (%d-%s)", recv_errno, fd_io_strerror( recv_errno ) ));
+      return recv_errno;
+    }
+
+    struct nlmsghdr const * nlh = fd_type_pun_const( buf );
+    if( FD_UNLIKELY( nlh->nlmsg_seq != seq ) ) {
+      /* Should only happen if caller misbehaves */
+      FD_LOG_WARNING(( "Dropping rtnetlink message type=%u seq=%u", nlh->nlmsg_type, nlh->nlmsg_seq ));
+      continue;
+    }
+    if( FD_UNLIKELY( nlh->nlmsg_type!=NLMSG_ERROR ) ) {
+      /* Should never happen */
+      FD_LOG_WARNING(( "unexpected nlmsg_type %u for RTM_NEWNEIGH request", nlh->nlmsg_type ));
+      continue;
+    }
+    struct nlmsgerr * err = NLMSG_DATA( nlh );
+    int nl_err = -err->error; /* netlink reports a negated errno, 0 meaning ACK */
+    return nl_err;
+  }
+
+  FD_LOG_WARNING(( "Giving up on receiving response code for RTM_NEWNEIGH request" ));
+  return 0;
+}
diff --git a/src/waltz/neigh/fd_neigh4_netlink.h b/src/waltz/neigh/fd_neigh4_netlink.h
new file mode 100644
index 0000000000..f7f136e17e
--- /dev/null
+++ b/src/waltz/neigh/fd_neigh4_netlink.h
@@ -0,0 +1,50 @@
+/* fd_neigh4_netlink.h provides APIs for importing IPv4 neighbors from
+ Linux netlink. Assumes link-layer addresses are 6 bytes long. */
+
+#if defined(__linux__)
+
+#include "fd_neigh4_map.h"
+#include "../ip/fd_netlink1.h"
+
+struct nlmsghdr; /* forward declaration */
+
+/* FD_NEIGH_NETLINK_* gives error codes for netlink import operations. */
+
+FD_PROTOTYPES_BEGIN
+
+/* fd_neigh4_netlink_request_dump requests a dump of the IPv4 neighbor
+ table for the given interface index. The kernel typically responds with
+ multi-part messages. */
+
+int
+fd_neigh4_netlink_request_dump( fd_netlink_t * netlink,
+ uint if_idx );
+
+/* fd_neigh4_netlink_ingest_message imports an RTM_NEWNEIGH or RTM_DELNEIGH
+ message. Logs warning if a netlink message with a different type is
+ inserted. Logs warning if link-layer addresses is not 6 bytes long.
+ (The caller is expected to verify that if_idx is an Ethernet interface.)
+ Ignores messages with an interface index other than if_idx. Causes
+ insert, update, or remove of a neighbor table entry. Only respects
+ IPv4 neighbor entries. Silently ignores IPv6 neighbor entries. */
+
+void
+fd_neigh4_netlink_ingest_message( fd_neigh4_hmap_t * map,
+ struct nlmsghdr const * msg,
+ uint if_idx );
+
+/* fd_neigh4_netlink_solicit requests the kernel to create a new neighbor
+ table entry and start an ARP request for it. Uses sendto(2) syscall.
+ Immediately tries to recvfrom(2) the error code. Assumes that netlink
+ socket is not bound and has no buffered messages. Returns 0 on success
+ and netlink error code on failure. The most common reason for failure
+ is EEXIST (neighbor entry already exists). */
+
+int
+fd_neigh4_netlink_solicit( fd_netlink_t * netlink,
+ uint if_idx,
+ uint ip4_addr );
+
+FD_PROTOTYPES_END
+
+#endif /* defined(__linux__) */
diff --git a/src/waltz/neigh/test_neigh4_netlink.c b/src/waltz/neigh/test_neigh4_netlink.c
new file mode 100644
index 0000000000..fb1a3609dd
--- /dev/null
+++ b/src/waltz/neigh/test_neigh4_netlink.c
@@ -0,0 +1,155 @@
+#include "fd_neigh4_netlink.h"
+#include
+#include
+#include /* AF_PACKET */
+#include
+#include /* ARPHRD_ETHER */
+#include
+#include
+#include "../../util/fd_util.h"
+
+FD_STATIC_ASSERT( sizeof(fd_neigh4_entry_t)==16, layout );
+
+static void
+dump_neighbor_table( fd_neigh4_hmap_t * map,
+                     fd_netlink_t *     netlink1,
+                     int                if_idx ) {
+  /* Fail the test if the request could not be sent: there is then no response to iterate over */
+  FD_TEST( !fd_neigh4_netlink_request_dump( netlink1, (uint)if_idx ) );
+  uchar buf[ 4096 ];
+  fd_netlink_iter_t iter[1];
+  for( fd_netlink_iter_init( iter, netlink1, buf, sizeof(buf) );
+       !fd_netlink_iter_done( iter );
+       fd_netlink_iter_next( iter, netlink1 ) ) {
+    fd_neigh4_netlink_ingest_message( map, fd_netlink_iter_msg( iter ), (uint)if_idx );
+  }
+
+  char         name_buf[ IF_NAMESIZE ];
+  char const * name = if_indextoname( (uint)if_idx, name_buf ); /* NULL if the index no longer names an interface */
+  fprintf( stderr, "# ip neigh show dev %s\n", name ? name : "(unknown)" );
+  fd_log_flush();
+  fd_neigh4_hmap_fprintf( map, stderr );
+  fputs( "\n", stderr );
+
+  /* Reinitialize table */
+  ulong  ele_max   = fd_neigh4_hmap_ele_max  ( map );
+  ulong  lock_cnt  = fd_neigh4_hmap_lock_cnt ( map );
+  ulong  probe_max = fd_neigh4_hmap_probe_max( map );
+  ulong  seed      = fd_neigh4_hmap_seed     ( map );
+  void * shmap     = fd_neigh4_hmap_shmap    ( map );
+  void * shele     = fd_neigh4_hmap_shele    ( map );
+  void * ljoin     = fd_neigh4_hmap_leave    ( map );
+  fd_neigh4_hmap_delete( shmap );
+  fd_memset( shele, 0, ele_max*sizeof(fd_neigh4_entry_t) );
+  FD_TEST( fd_neigh4_hmap_new( shmap, ele_max, lock_cnt, probe_max, seed ) );
+  FD_TEST( fd_neigh4_hmap_join( ljoin, shmap, shele ) );
+}
+
+static void
+dump_all_neighbor_tables( fd_neigh4_hmap_t * map,
+ fd_netlink_t * netlink0,
+ fd_netlink_t * netlink1 ) {
+ /* netlink0 carries the link dump, netlink1 the per-interface neighbor dumps issued while iterating it */
+ /* List all network interfaces */
+
+ uint seq = netlink0->seq++;
+ struct {
+ struct nlmsghdr nlh;
+ struct ifinfomsg ifi;
+ } request;
+ request.nlh = (struct nlmsghdr){
+ .nlmsg_len = sizeof(request),
+ .nlmsg_type = RTM_GETLINK,
+ .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+ .nlmsg_seq = seq
+ };
+ request.ifi = (struct ifinfomsg){
+ .ifi_family = AF_PACKET,
+ .ifi_type = ARPHRD_ETHER
+ };
+
+ long send_res = send( netlink0->fd, &request, sizeof(request), 0);
+ if( FD_UNLIKELY( send_res<0 ) ) {
+ FD_LOG_ERR(( "netlink send(RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER) failed (%d-%s)", errno, fd_io_strerror( errno ) ));
+ }
+ if( FD_UNLIKELY( send_res!=sizeof(request) ) ) {
+ FD_LOG_ERR(( "netlink send(RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER) failed (short write)" ));
+ }
+
+ FD_LOG_NOTICE(( "Dumping neighbor tables for all Ethernet interfaces\n" ));
+ fd_log_flush();
+ /* Walk the RTM_GETLINK response and dump the neighbor table of every link reported */
+ uchar buf[ 4096 ];
+ fd_netlink_iter_t iter[1];
+ for( fd_netlink_iter_init( iter, netlink0, buf, sizeof(buf) );
+ !fd_netlink_iter_done( iter );
+ fd_netlink_iter_next( iter, netlink0 ) ) {
+ struct nlmsghdr const * nlh = fd_netlink_iter_msg( iter );
+ if( FD_UNLIKELY( nlh->nlmsg_type==NLMSG_ERROR ) ) {
+ struct nlmsgerr * err = NLMSG_DATA( nlh );
+ int nl_err = -err->error; /* netlink reports a negated errno */
+ FD_LOG_ERR(( "netlink RTM_GETLINK,NLM_F_REQUEST|NLM_F_DUMP,ARPHRD_ETHER failed (%d-%s)", nl_err, fd_io_strerror( nl_err ) ));
+ }
+ if( FD_UNLIKELY( nlh->nlmsg_type!=RTM_NEWLINK ) ) { /* only link descriptions are of interest here */
+ FD_LOG_DEBUG(( "unexpected nlmsg_type %u", nlh->nlmsg_type ));
+ continue;
+ }
+ struct ifinfomsg const * ifi = NLMSG_DATA( nlh );
+
+ dump_neighbor_table( map, netlink1, ifi->ifi_index ); /* uses the second socket so the link dump on netlink0 is not disturbed */
+ }
+
+}
+
+int
+main( int argc,
+ char ** argv ) {
+ fd_boot( &argc, &argv );
+ /* Derive the default NUMA node from this tile's CPU; use CPU 0 if the index is not a valid CPU */
+ ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() );
+ if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL;
+
+ char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" );
+ ulong page_cnt = fd_env_strip_cmdline_ulong ( &argc, &argv, "--page-cnt", NULL, 1UL );
+ ulong numa_idx = fd_env_strip_cmdline_ulong ( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx(cpu_idx) );
+
+ ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz );
+ if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" ));
+
+ FD_LOG_NOTICE(( "Creating anonymous workspace with --page-cnt %lu --page-sz %s pages on --numa-idx %lu", page_cnt, _page_sz, numa_idx ));
+ fd_wksp_t * wksp = fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL );
+ FD_TEST( wksp );
+ /* Two netlink objects: one for the link dump, one for the nested neighbor dumps */
+ fd_netlink_t _netlink[2];
+ fd_netlink_t * netlink0 = fd_netlink_init( _netlink+0, 42U ); /* second argument presumably seeds the sequence number - TODO confirm */
+ fd_netlink_t * netlink1 = fd_netlink_init( _netlink+1, 999U );
+ FD_TEST( netlink0 );
+ FD_TEST( netlink1 );
+ /* The neighbor map keeps its metadata (hmap_mem) and its element array (ele_mem) in separate allocations */
+ ulong ele_max = 16384UL;
+ ulong lock_cnt = 4UL;
+ ulong probe_max = 16UL;
+ ulong seed = 42UL;
+ void * hmap_mem = fd_wksp_alloc_laddr( wksp, fd_neigh4_hmap_align(), fd_neigh4_hmap_footprint( ele_max, lock_cnt, probe_max ), 1UL );
+ void * ele_mem = fd_wksp_alloc_laddr( wksp, alignof(fd_neigh4_entry_t), ele_max*sizeof(fd_neigh4_entry_t), 1UL );
+ FD_TEST( hmap_mem ); FD_TEST( ele_mem );
+ FD_TEST( fd_neigh4_hmap_new( hmap_mem, ele_max, lock_cnt, probe_max, seed ) );
+
+ fd_neigh4_hmap_t _map[1];
+ fd_neigh4_hmap_t * map = fd_neigh4_hmap_join( _map, hmap_mem, ele_mem );
+ FD_TEST( map );
+
+ dump_all_neighbor_tables( map, netlink0, netlink1 );
+ /* Tear down in reverse order of construction */
+ fd_netlink_fini( netlink0 );
+ fd_netlink_fini( netlink1 );
+
+ fd_neigh4_hmap_leave( map );
+ fd_wksp_free_laddr( fd_neigh4_hmap_delete( hmap_mem ) );
+ fd_wksp_free_laddr( ele_mem );
+ fd_wksp_delete_anonymous( wksp );
+
+ FD_LOG_NOTICE(( "pass" ));
+ fd_halt();
+ return 0;
+}