Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 89 additions & 66 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
import traceback
import subprocess
import concurrent.futures
import ijson

from ipaddress import ip_network
from swsscommon import swsscommon
Expand Down Expand Up @@ -391,70 +390,68 @@ def is_suppress_fib_pending_enabled(namespace):

def fetch_routes(ipv6=False, namespace=multi_asic.DEFAULT_NAMESPACE):
"""
Fetch routes using the given command.
Uses ijson for streaming JSON parsing to handle large route tables efficiently.
Parses each prefix entry as soon as it becomes available instead of waiting for the entire JSON.
"""
Fetch non-offloaded FRR routes by streaming vtysh TEXT output line-by-line.

FRR zebra_vty.c assembles each route line as three fixed-position characters:
[0] protocol char : 'B' for BGP (zebra_route_char)
[1] selected char : '>' if ZEBRA_FLAG_SELECTED, else ' '
[2] status char : single char from re_status_output_char():
'q' ROUTE_ENTRY_QUEUED (suppress-fib-pending)
'o' ZEBRA_FLAG_OFFLOAD_FAILED
't' ZEBRA_FLAG_TRAPPED
'*' nexthop active / installed
' ' none of the above
Examples:
B>* 10.0.0.0/24 selected, offloaded (healthy)
B>q 10.0.0.0/24 selected, queued (suppress-fib-pending)
B>o 10.0.0.0/24 selected, offload failed
The status char is always at position [2], never before '>'.
Only lines with 'q' or 'o' at position [2] are collected; on a healthy
device the output is scanned and discarded without any allocations.

Returns lists of plain prefix strings. The protocol is always 'bgp' for
these routes; callers that need it (e.g. mitigate_installed_not_offloaded_frr_routes)
hardcode that value rather than carrying it in the list entries.
"""
import re
missing_routes = []
failing_routes = []

asic_id = []
if namespace is not multi_asic.DEFAULT_NAMESPACE:
asic_id = ['-n', str(multi_asic.get_asic_id_from_name(namespace))]

if ipv6:
cmd = ["sudo", "vtysh"] + asic_id + ["-c", "show ipv6 route json"]
else:
cmd = ["sudo", "vtysh"] + asic_id + ["-c", "show ip route json"]

def process_route_entry(prefix, route_entry):
"""Process a single route entry and add to missing_routes if needed."""
if route_entry.get('protocol') in ('connected', 'kernel', 'static'):
return
if route_entry.get('vrfName') != 'default':
return
# skip if this bgp source prefix is not selected as best
if not route_entry.get('selected', False):
return
if not route_entry.get('offloaded', False):
missing_routes.append(prefix)
if route_entry.get('failed', False):
failing_routes.append(prefix)
af = 'ipv6' if ipv6 else 'ip'
cmd = ["sudo", "vtysh"] + asic_id + ["-c", f"show {af} route"]

try:
with subprocess.Popen(cmd, stdout=subprocess.PIPE, bufsize=0) as proc:
try:
# Use ijson to parse the JSON stream incrementally
# kvitems('') iterates over key-value pairs at the root level
# This gives us each prefix and its route entries as they become available
for prefix, route_entries in ijson.kvitems(proc.stdout, ''):
# Process each route entry for this prefix
if isinstance(route_entries, list):
for route_entry in route_entries:
process_route_entry(prefix, route_entry)
else:
# Handle case where route_entries is not a list (shouldn't happen with valid FRR output)
print_message(syslog.LOG_WARNING, f"Unexpected route entry format for prefix {prefix}")

except ijson.JSONError as e:
# Handle JSON parsing errors
print_message(syslog.LOG_WARNING, f"Failed to parse JSON stream: {e}")
except UnicodeDecodeError as e:
# Handle UTF-8 decoding errors
print_message(syslog.LOG_WARNING, f"UTF-8 decoding error: {e}")
except Exception as e:
# Handle any other unexpected errors during parsing
print_message(syslog.LOG_WARNING, f"Error during JSON parsing: {e}")
# Match selected BGP routes with a non-offloaded status flag:
# B>q prefix -- queued (suppress-fib-pending)
# B>o prefix -- offload failed
# Format is always: proto_char + selected_char(>) + status_char + space + prefix
pat = re.compile(r'^B>([qo])\s+(\S+)\s')

# Wait for the process to terminate and get the return code
try:
with subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
bufsize=1, text=True) as proc:
for line in proc.stdout:
m = pat.match(line)
if not m:
continue
flag = m.group(1)
prefix = m.group(2)
if flag == 'q':
missing_routes.append(prefix)
if flag == 'o':
failing_routes.append(prefix)
return_code = proc.wait()
if return_code != 0:
print_message(syslog.LOG_WARNING, f"Subprocess exited with non-zero return code: {return_code}")

print_message(syslog.LOG_WARNING,
f"vtysh exited with non-zero return code: {return_code}")
except FileNotFoundError:
print_message(syslog.LOG_ERR, f"Error: Command '{cmd[0]}' not found.")
except Exception as e:
print_message(syslog.LOG_ERR, f"An error occurred: {e}")
print_message(syslog.LOG_ERR, f"fetch_routes error: {e}")

return missing_routes, failing_routes

Expand Down Expand Up @@ -733,20 +730,46 @@ def is_feature_bgp_enabled(namespace):

def check_frr_pending_routes(namespace):
"""
Check FRR routes for offload flag presence by executing "show ip route json"
Returns a list of routes that have no offload flag.
Check FRR routes for offload flag presence by streaming vtysh text output.
Returns lists of route dicts {'prefix', 'protocol'} that are persistently
non-offloaded across all retry iterations (intersection logic).

Intersection approach: only routes that appear as stuck in *every* poll
are returned for mitigation. A route that clears between iterations is
converging normally and should not be mitigated.

Example with FRR_CHECK_RETRIES=3:
iter 0: {A, B, C, D}
iter 1: {A, B, C} (D converged)
iter 2: {A, B} (C converged)
result: {A, B} <- only truly stuck routes
"""
acc_miss = []
acc_fail = []

missed_rt = []
failed_rt = []
retries = FRR_CHECK_RETRIES
for i in range(retries):
missed_rt, failed_rt = get_frr_routes_parallel(namespace)
for i in range(FRR_CHECK_RETRIES):
curr_miss, curr_fail = get_frr_routes_parallel(namespace)

if not missed_rt and not failed_rt:
if i == 0:
# Seed: first iteration defines the candidate set
acc_miss = curr_miss
acc_fail = curr_fail
else:
# Intersect: keep only prefixes still stuck in this iteration
curr_miss_set = set(curr_miss)
curr_fail_set = set(curr_fail)
acc_miss = [e for e in acc_miss if e in curr_miss_set]
acc_fail = [e for e in acc_fail if e in curr_fail_set]

if not curr_miss and not curr_fail:
# Nothing queued right now; intersection already empty -- stop early
break

time.sleep(FRR_WAIT_TIME)
if i < FRR_CHECK_RETRIES - 1:
time.sleep(FRR_WAIT_TIME)

missed_rt = acc_miss
failed_rt = acc_fail
print_message(syslog.LOG_DEBUG, f"FRR missed routes: {missed_rt}")
print_message(syslog.LOG_DEBUG, f"FRR failed routes: {failed_rt}")
return missed_rt, failed_rt
Expand All @@ -766,11 +789,11 @@ def mitigate_installed_not_offloaded_frr_routes(namespace, missed_frr_rt, rt_app
"""
db = swsscommon.DBConnector('APPL_STATE_DB', REDIS_TIMEOUT_MSECS, True, namespace)
response_producer = swsscommon.NotificationProducer(db, f'{APPL_DB_NAME}_{swsscommon.APP_ROUTE_TABLE_NAME}_RESPONSE_CHANNEL')
for entry in [entry for entry in missed_frr_rt if entry['prefix'] in rt_appl]:
fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', entry['protocol'])])
response_producer.send('SWSS_RC_SUCCESS', entry['prefix'], fvs)
for prefix in [prefix for prefix in missed_frr_rt if prefix in rt_appl]:
fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', 'bgp')])
response_producer.send('SWSS_RC_SUCCESS', prefix, fvs)

print_message(syslog.LOG_ERR, f'Mitigated route {entry["prefix"]}', write_to_stdout=False)
print_message(syslog.LOG_ERR, f'Mitigated route {prefix}', write_to_stdout=False)


def get_soc_ips(config_db):
Expand Down Expand Up @@ -888,6 +911,8 @@ def check_routes_for_namespace(namespace):
rt_frr_miss = []
rt_frr_failed = []

rt_frr_miss, rt_frr_failed = check_frr_pending_routes(namespace)

selector, subs, rt_asic = get_asicdb_routes(namespace)

rt_appl = get_appdb_routes(namespace)
Expand Down Expand Up @@ -945,8 +970,6 @@ def check_routes_for_namespace(namespace):
if rt_asic_miss:
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

rt_frr_miss, rt_frr_failed = check_frr_pending_routes(namespace)

if rt_frr_miss:
results["missed_FRR_routes"] = rt_frr_miss

Expand Down
Loading
Loading